# %% [markdown]
# # 4omini query
#
# Evaluate a single-step question-answering workflow on PubMedQA with
# GPT-4o-mini (Azure deployment, called through LiteLLM) and set up the
# pieces needed for TextGrad prompt optimization.

# %%
# All imports live in this one cell; every name is imported exactly once.
import os
from copy import deepcopy

import nest_asyncio
import numpy as np
from dotenv import load_dotenv

from evoagentx.agents import AgentManager
from evoagentx.benchmark import (
    AFlowMBPP,
    AFlowMBPPPLUS,
    HotPotQA,
    HumanEval,
    HumanEvalPLUS,
    MBPP,
    MBPPPLUS,
    PubMedQA,
    SciCode,
)
from evoagentx.core.callbacks import suppress_logger_info
from evoagentx.core.logging import logger
from evoagentx.evaluators import Evaluator
from evoagentx.models import (
    AzureOpenAIConfig,
    LiteLLM,
    LiteLLMConfig,
    OpenAILLM,
    OpenAILLMConfig,
)
from evoagentx.optimizers import SEWOptimizer, STRUCTUREOptimizer, TextGradOptimizer
from evoagentx.optimizers.structure_optimizer import STRUCTUREWorkFlowScheme
from evoagentx.prompts import StringTemplate
from evoagentx.workflow import (
    SEWWorkFlowGraph,
    STRUCTUREWorkFlowGraph,
    SequentialWorkFlowGraph,
)

# Patch asyncio so event loops can be nested inside the notebook's own loop.
nest_asyncio.apply()

# %%
# Split configuration: one place to tune the seed and subset sizes.
SPLIT_SEED = 42
TRAIN_SIZE = 50
DEV_SIZE = 50
TEST_SIZE = 500


class PubMedQASplits(PubMedQA):
    """PubMedQA with small, fixed train/dev subsets and a capped test set."""

    def _load_data(self):
        """Load the data through the parent class, then re-split it.

        After ``super()._load_data()`` the parent's ``_dev_data`` is shuffled
        with a fixed seed and cut into two *disjoint* subsets:

        * ``_train_data`` -- the first ``TRAIN_SIZE`` shuffled examples
        * ``_dev_data``   -- the next ``DEV_SIZE`` shuffled examples

        ``_test_data`` is truncated to its first ``TEST_SIZE`` entries and the
        untouched pool is kept on ``_fulldata``.

        NOTE(review): this assumes the parent populates ``_dev_data`` and
        ``_test_data`` as sliceable sequences -- confirm against ``PubMedQA``.
        """
        super()._load_data()

        # Grab the full pool before ``_dev_data`` is overwritten below.
        full_dev_data = self._dev_data

        # Seeding the global NumPy RNG is kept on purpose: it is an observable
        # side effect that later cells may (implicitly) rely on.
        np.random.seed(SPLIT_SEED)
        permutation = np.random.permutation(len(full_dev_data))

        # Previously both splits used ``permutation[:50]``, which made the dev
        # set an exact copy of the train set. The slices are now disjoint.
        train_indices = permutation[:TRAIN_SIZE]
        dev_indices = permutation[TRAIN_SIZE:TRAIN_SIZE + DEV_SIZE]

        self._train_data = [full_dev_data[idx] for idx in train_indices]
        self._dev_data = [full_dev_data[idx] for idx in dev_indices]
        self._test_data = self._test_data[:TEST_SIZE]
        self._fulldata = full_dev_data


def collate_func(example: dict) -> dict:
    """Turn one benchmark example into the workflow's ``problem`` input.

    Args:
        example: mapping that provides ``example["context"]["contexts"]``
            (an iterable of strings) and ``example["question"]``.

    Returns:
        ``{"problem": ...}`` where the value holds the newline-joined context
        paragraphs, the question, and a trailing ``Answer:`` cue.
    """
    context = "\n".join(example["context"]["contexts"])
    problem = "Context: {}\n\nQuestion: {}\n\nAnswer:".format(context, example["question"])
    return {"problem": problem}


# The variable keeps its historical ``hotpotqa_`` name because later cells
# reference it, even though it is used with the PubMedQA benchmark here.
hotpotqa_graph_data = {
    "goal": "Answer the question based on the context. The answer should be a direct response to the question, without including explanations or reasoning.",
    "tasks": [
        {
            "name": "answer_generate",
            "description": "Answer the question based on the context.",
            "inputs": [
                {"name": "problem", "type": "str", "required": True, "description": "The problem to solve."}
            ],
            "outputs": [
                {"name": "answer", "type": "str", "required": True, "description": "The answer to the problem."}
            ],
            # The example tags spell out the XML layout that parse_mode="xml"
            # has to parse; the bare "xxx and xxx" wording gave the model no
            # tag names to follow.
            "prompt_template": StringTemplate(instruction="Think step by step to answer the question. You should explain your thinking process in the 'thought' field, and provide the final answer in the 'answer' field.\nFormat your output in xml format, such as <thought>xxx</thought> and <answer>xxx</answer>."),
            "parse_mode": "xml"
        }
    ]
}

# %%
# LLM configuration.
# Credentials are read from the environment (optionally from a local .env
# file) instead of being written into the notebook. Non-secret settings are
# pinned here so the notebook always targets the same deployment.
load_dotenv()
os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"] = "gpt-4o-mini"
os.environ["AZURE_OPENAI_API_VERSION"] = "2025-01-01-preview"
# setdefault keeps the old empty-string fallback without clobbering real values.
os.environ.setdefault("AZURE_OPENAI_ENDPOINT", "")
os.environ.setdefault("AZURE_OPENAI_KEY", "")

if not os.environ["AZURE_OPENAI_ENDPOINT"] or not os.environ["AZURE_OPENAI_KEY"]:
    logger.warning(
        "AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY are empty; "
        "set them in the environment or in a .env file before calling the model."
    )

llm_config = LiteLLMConfig(
    model="azure/" + os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],  # Azure model format
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_key=os.environ["AZURE_OPENAI_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    top_p=0.85,
    temperature=0.2,
    frequency_penalty=0.0,
    presence_penalty=0.0,
)

# The executor and the optimizer share one configuration.
executor_llm = LiteLLM(config=llm_config)
optimizer_llm = LiteLLM(config=llm_config)
# %%
# Benchmark, workflow graph, and the agents that execute the graph.
benchmark = PubMedQASplits()
workflow_graph = SequentialWorkFlowGraph.from_dict(hotpotqa_graph_data)

agent_manager = AgentManager()
agent_manager.add_agents_from_workflow(workflow_graph, executor_llm.config)

# Evaluator: runs the workflow over a dataset split using ``collate_func``
# to build each example's input.
evaluator = Evaluator(
    agent_manager=agent_manager,
    llm=executor_llm,
    collate_func=collate_func,
    verbose=True,
    num_workers=20,
)

# TextGrad optimizer; keyword arguments are grouped by concern.
textgrad_optimizer = TextGradOptimizer(
    # what is optimized, and with which models
    graph=workflow_graph,
    optimize_mode="all",
    constraints=[],
    executor_llm=executor_llm,
    optimizer_llm=optimizer_llm,
    # optimization schedule
    max_steps=20,
    batch_size=3,
    rollback=True,
    # evaluation during optimization
    evaluator=evaluator,
    eval_rounds=1,
    eval_every_n_steps=1,
    # persistence
    save_path="./",
    save_interval=None,
)

# %%
# Baseline: score the un-optimized workflow on the test split.
logger.info("Evaluating workflow on test set...")
with suppress_logger_info():
    results = textgrad_optimizer.evaluate(eval_mode="test", dataset=benchmark)
logger.info(f"Evaluation metrics (before optimization): {results}")

# %% [markdown]
# # textgrad (prompt)
"execution_count": 6, "id": "d686ee20", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 11:53:19.708\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mEvaluating workflow on test set...\u001b[0m\n", "Evaluating workflow: 0%| | 1/500 [00:01<12:07, 1.46s/it]Task exception was never retrieved\n", "future: exception=RuntimeError('Event loop is closed')>\n", "Traceback (most recent call last):\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/tasks.py\", line 277, in __step\n", " result = coro.send(None)\n", " ^^^^^^^^^^^^^^^\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/utils.py\", line 873, in _client_async_logging_helper\n", " GLOBAL_LOGGING_WORKER.ensure_initialized_and_enqueue(\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/litellm_core_utils/logging_worker.py\", line 322, in ensure_initialized_and_enqueue\n", " self.enqueue(async_coroutine)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/litellm_core_utils/logging_worker.py\", line 131, in enqueue\n", " self._queue.put_nowait(task)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/queues.py\", line 147, in put_nowait\n", " self._wakeup_next(self._getters)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/queues.py\", line 63, in _wakeup_next\n", " waiter.set_result(None)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/futures.py\", line 263, in set_result\n", " self.__schedule_callbacks()\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/futures.py\", line 173, in __schedule_callbacks\n", " self._loop.call_soon(callback, self, context=ctx)\n", " File 
\"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/base_events.py\", line 762, in call_soon\n", " self._check_closed()\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/base_events.py\", line 520, in _check_closed\n", " raise RuntimeError('Event loop is closed')\n", "RuntimeError: Event loop is closed\n", "Evaluating workflow: 0%| | 2/500 [00:01<05:36, 1.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 2%|▏ | 8/500 [00:01<01:05, 7.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 2%|▏ | 12/500 [00:02<00:50, 9.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 4%|▎ | 18/500 [00:02<00:29, 16.34it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 
1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 4%|▍ | 21/500 [00:03<00:59, 8.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 5%|▌ | 26/500 [00:03<00:45, 10.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▌ | 28/500 [00:03<00:46, 10.09it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 6%|▋ | 32/500 [00:04<00:41, 11.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 7%|▋ | 37/500 [00:04<00:36, 12.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5833333333333334, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 39/500 [00:04<00:33, 13.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics 
{'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 41/500 [00:04<00:40, 11.20it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▊ | 43/500 [00:05<00:53, 8.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▉ | 46/500 [00:05<00:41, 10.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|▉ | 48/500 [00:05<00:51, 8.71it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 10%|█ | 51/500 [00:06<00:55, 8.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 11%|█ | 56/500 [00:06<00:48, 9.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 
'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 58/500 [00:06<00:45, 9.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 60/500 [00:07<00:44, 9.96it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 13%|█▎ | 64/500 [00:07<00:40, 10.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.08695652173913045, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 13%|█▎ | 67/500 [00:07<00:53, 8.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 14%|█▍ | 71/500 [00:08<00:48, 8.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 15%|█▍ | 74/500 [00:08<00:41, 10.20it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 
0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 15%|█▌ | 76/500 [00:08<00:38, 10.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.03333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 16%|█▌ | 79/500 [00:09<00:49, 8.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.30769230769230765, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 16%|█▌ | 81/500 [01:00<47:48, 6.85s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 17%|█▋ | 87/500 [01:00<18:50, 2.74s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 18%|█▊ | 89/500 [01:01<14:30, 2.12s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.125, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 19%|█▊ | 93/500 [01:01<08:11, 1.21s/it]" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 97/500 [01:02<05:14, 1.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|█▉ | 99/500 [01:02<04:09, 1.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 101/500 [01:02<03:23, 1.96it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.27272727272727276, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██ | 106/500 [01:02<01:50, 3.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 108/500 [01:03<01:36, 4.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 110/500 [01:03<01:48, 3.61it/s]" ] }, { 
"name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 112/500 [01:04<01:29, 4.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 23%|██▎ | 116/500 [01:04<01:02, 6.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.04081632653061225, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4615384615384615, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 24%|██▍ | 121/500 [01:04<00:37, 10.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▍ | 124/500 [01:04<00:33, 11.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7368421052631579, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▌ | 126/500 [01:05<00:38, 9.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 
1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 26%|██▌ | 130/500 [01:05<00:40, 9.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7777777777777778, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 27%|██▋ | 136/500 [01:06<00:45, 8.04it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 29%|██▉ | 146/500 [01:06<00:21, 16.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|██▉ | 149/500 [01:07<00:44, 7.96it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.42857142857142855, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, 
{ "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 31%|███▏ | 157/500 [01:08<00:32, 10.71it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.21052631578947367, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 162/500 [01:08<00:26, 12.70it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 34%|███▎ | 168/500 [01:09<00:33, 10.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.1111111111111111, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▍ | 171/500 [01:10<00:41, 7.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 35%|███▍ | 173/500 [02:00<27:24, 5.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 
1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▌ | 175/500 [02:01<19:41, 3.64s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▌ | 177/500 [02:01<14:02, 2.61s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 36%|███▋ | 182/500 [02:01<06:13, 1.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.625, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 37%|███▋ | 184/500 [02:01<04:39, 1.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.38095238095238093, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 37%|███▋ | 186/500 [02:02<03:33, 1.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.11764705882352941, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 38%|███▊ | 188/500 [02:02<03:08, 1.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", 
"text": [ "Evaluating workflow: 38%|███▊ | 192/500 [02:03<01:42, 3.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 39%|███▉ | 197/500 [02:03<00:56, 5.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|███▉ | 199/500 [02:03<00:51, 5.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 41%|████ | 204/500 [02:04<00:46, 6.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 41%|████ | 206/500 [02:04<00:49, 5.97it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 42%|████▏ | 209/500 [02:05<00:37, 
7.83it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 43%|████▎ | 215/500 [02:05<00:25, 11.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 43%|████▎ | 217/500 [02:05<00:24, 11.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7777777777777778, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 219/500 [02:06<00:48, 5.81it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 45%|████▍ | 224/500 [02:06<00:36, 7.64it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.21428571428571425, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 46%|████▌ | 229/500 [02:07<00:23, 11.34it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 
0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.15384615384615385, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▋ | 232/500 [02:07<00:24, 11.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 47%|████▋ | 234/500 [02:07<00:25, 10.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 47%|████▋ | 236/500 [02:07<00:23, 11.36it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 48%|████▊ | 238/500 [02:08<00:28, 9.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 49%|████▊ | 243/500 [02:08<00:24, 10.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 50%|████▉ | 248/500 [02:09<00:21, 11.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|█████ | 250/500 [02:09<00:29, 8.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 51%|█████ | 254/500 [02:09<00:27, 9.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.15384615384615385, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 51%|█████▏ | 257/500 [02:10<00:25, 9.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 52%|█████▏ | 259/500 [02:10<00:27, 8.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 53%|█████▎ | 263/500 [02:10<00:25, 9.45it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 0.0}\n", 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 265/500 [02:11<00:31, 7.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.13333333333333336, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 54%|█████▍ | 269/500 [03:01<19:09, 4.98s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5806451612903226, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 55%|█████▍ | 274/500 [03:01<08:46, 2.33s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 55%|█████▌ | 277/500 [03:01<05:37, 1.51s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 279/500 [03:01<04:11, 1.14s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 281/500 [03:02<03:10, 1.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 
0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 57%|█████▋ | 286/500 [03:03<01:45, 2.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.23529411764705882, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 58%|█████▊ | 288/500 [03:03<01:22, 2.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 59%|█████▉ | 294/500 [03:03<00:43, 4.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 11:56:23.691\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 59%|█████▉ | 296/500 [03:03<00:35, 5.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 60%|█████▉ | 298/500 [03:04<00:33, 6.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 60%|██████ | 301/500 [03:05<00:44, 4.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 61%|██████ | 304/500 [03:05<00:29, 6.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 62%|██████▏ | 308/500 [03:05<00:22, 8.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 310/500 [03:05<00:18, 10.14it/s]" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 63%|██████▎ | 313/500 [03:06<00:17, 10.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3571428571428571, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 63%|██████▎ | 317/500 [03:07<00:29, 6.20it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 322/500 [03:07<00:16, 10.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 66%|██████▌ | 330/500 [03:07<00:11, 15.34it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", 
"Evaluating workflow: 67%|██████▋ | 333/500 [03:08<00:21, 7.64it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 67%|██████▋ | 335/500 [03:08<00:22, 7.36it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 68%|██████▊ | 340/500 [03:09<00:16, 9.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 342/500 [03:09<00:16, 9.44it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 69%|██████▉ | 344/500 [03:09<00:18, 8.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 69%|██████▉ | 346/500 [03:10<00:20, 7.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 70%|██████▉ | 348/500 
[03:10<00:20, 7.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|███████ | 350/500 [03:10<00:16, 9.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|███████ | 352/500 [03:10<00:16, 9.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.04444444444444445, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 71%|███████ | 354/500 [03:11<00:18, 7.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 71%|███████▏ | 357/500 [03:11<00:16, 8.42it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 360/500 [04:01<16:37, 7.12s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 73%|███████▎ | 364/500 [04:01<08:13, 3.63s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics 
{'f1': 0.07142857142857144, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 74%|███████▍ | 369/500 [04:01<03:32, 1.62s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6363636363636364, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.21052631578947367, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 74%|███████▍ | 371/500 [04:02<02:36, 1.21s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▍ | 374/500 [04:02<01:43, 1.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.35294117647058826, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▌ | 376/500 [04:02<01:21, 1.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 76%|███████▌ | 378/500 [04:03<01:05, 1.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 381/500 [04:03<00:37, 3.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.10526315789473684, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 77%|███████▋ | 385/500 [04:03<00:22, 5.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 388/500 [04:04<00:15, 7.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.11764705882352941, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 390/500 [04:04<00:16, 6.65it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 392/500 [04:05<00:24, 4.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 79%|███████▉ | 396/500 [04:05<00:16, 6.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 
0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 80%|████████ | 401/500 [04:06<00:10, 9.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 403/500 [04:06<00:08, 10.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.35714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 405/500 [04:06<00:15, 6.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████▏ | 407/500 [04:07<00:16, 5.55it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 82%|████████▏ | 410/500 [04:07<00:14, 6.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 82%|████████▏ | 412/500 [04:07<00:11, 7.65it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 
'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 84%|████████▎ | 418/500 [04:08<00:07, 11.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 420/500 [04:08<00:06, 11.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 422/500 [04:08<00:07, 10.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▍ | 424/500 [04:08<00:07, 9.70it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 85%|████████▌ | 427/500 [04:09<00:08, 8.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 86%|████████▌ | 431/500 [04:09<00:06, 10.99it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 
0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 433/500 [04:09<00:06, 10.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 435/500 [04:09<00:06, 9.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 437/500 [04:10<00:07, 8.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 88%|████████▊ | 439/500 [04:10<00:09, 6.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 89%|████████▊ | 443/500 [04:11<00:05, 9.79it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 446/500 [04:11<00:04, 12.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics 
{'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 90%|████████▉ | 448/500 [04:11<00:03, 13.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8235294117647058, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 90%|█████████ | 450/500 [04:11<00:04, 10.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 90%|█████████ | 452/500 [04:12<00:07, 6.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████ | 453/500 [05:01<06:40, 8.52s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████ | 456/500 [05:01<03:37, 4.94s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5263157894736842, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 92%|█████████▏| 460/500 [05:01<01:42, 2.55s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 92%|█████████▏| 462/500 [05:02<01:09, 1.83s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 93%|█████████▎| 464/500 [05:02<00:49, 1.37s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 93%|█████████▎| 466/500 [05:02<00:32, 1.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.06060606060606061, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 93%|█████████▎| 467/500 [05:02<00:25, 1.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 94%|█████████▍| 470/500 [05:03<00:12, 2.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.47619047619047616, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 95%|█████████▍| 474/500 [05:03<00:05, 4.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▌| 476/500 
[05:03<00:04, 4.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Evaluating workflow: 95%|█████████▌| 477/500 [05:04<00:05, 3.99it/s]Unclosed connector\n", "connections: ['deque([(, 9706833.339199008)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.608262513)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.58293831)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.98561138)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706833.144348238)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.568526758)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.190116027)])']\n", "connector: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9706833.246411916)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.970390048)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.728774056)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.971580995)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.543152127)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.332917776)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706833.070988659)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.820581889)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706833.061071675)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.660261652)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706833.057310568)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.320419114)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706832.92792528)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.039624516)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.41924051)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.0620211)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.30129984)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706833.969782745)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.279454239)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.86199866)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9706834.566667313)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.202076651)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.574714107)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.32608822)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.86027172)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.79403196)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.198638653)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.32135346)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.276575617)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.312936094)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 
9706837.385317784)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.561538259)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.83374123)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.09928204)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.475442462)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.954519052)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.543440253)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.713191785)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.011249008)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706834.854768094)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.67486628)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.152896648)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.7468186)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.870078135)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.21862246)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.747099983)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.275274444)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.94819115)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9706836.904656712)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.03730312)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.460972665)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706835.972691856)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.449210517)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.10574446)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706836.37345917)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.474045642)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.115622288)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.94052132)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: 
['deque([(, 9706839.0674185)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.622524388)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.301349184)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.950606762)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.991567569)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.686984628)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.088006405)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706838.171331778)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706840.300554968)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.778763738)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.9880991)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706837.726972347)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.455664787)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.81647973)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9706891.618730936)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.844060568)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706892.169079188)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706892.218102802)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706892.234274939)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.299922405)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706892.004405266)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.662652526)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.752106642)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.530107627)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706892.14507622)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.511774916)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706839.382643469)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.59241538)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9706892.11316638)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.20912134)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.597493544)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.468325477)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.538457343)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.612794887)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706891.583362848)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.15275196)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.051218217)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706892.989315974)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.670760855)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.771038584)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.899447188)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706894.489606064)])']\n", "connector: \n", "Unclosed client session\n", "client_session: 
\n", "Unclosed connector\n", "connections: ['deque([(, 9706895.193371318)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.141026651)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.375255115)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706894.667291349)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.546501007)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.36791289)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.579615466)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706893.602766184)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706894.959430479)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706894.772316653)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706894.853254467)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.775585068)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.870579008)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 
9706895.385935564)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706896.18071492)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.919915168)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.523545437)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.542247811)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706896.28712492)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.36445807)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706896.372668212)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.226458225)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706896.615855856)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.205519568)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.245286625)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.258316396)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706895.56308174)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 
9706897.320896113)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.351739809)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.522463731)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.620288484)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.311815055)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.22144762)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.38648522)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.422608884)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.35692558)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.624923551)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.270264504)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.787571924)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706898.597296853)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706898.21663628)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9706897.252509214)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.27237713)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706897.591074271)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706898.824499112)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.48631274)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.161758523)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706898.772059185)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.149813682)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.55342984)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.12129742)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.145715035)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706900.211053928)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.493575294)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.317977337)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.226675492)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.540468458)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.251080165)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706899.120114308)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706900.142985068)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706900.376150131)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9706900.893129088)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706900.309064128)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706951.514310287)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706951.666614879)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706900.380748404)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.491656585)])']\n", "connector: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.77399658)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.466608366)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.186743349)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706951.78028774)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.15023707)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706951.386311997)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706951.821540905)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.244626231)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.493340295)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.887397302)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706953.917681018)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.037982732)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.414413815)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.100112706)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706952.29696486)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706951.924534544)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.0561336)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706953.9124161)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.021882122)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.186679348)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.18034006)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.131624991)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.004607322)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.267817756)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.494096704)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.47808753)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.738085156)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.614886004)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706954.438011425)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.5773319)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706953.978842732)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706953.869536418)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706953.993282996)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706953.598444892)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.01996488)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.614400096)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.27564206)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.219060805)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.118222432)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.626830677)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.803666467)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.071917487)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.749512635)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706956.12974136)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.12941382)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.127974983)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.61095996)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.917122157)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706955.788781)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.532948257)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.948422918)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.78977187)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.90019276)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.297801595)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.849599527)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9706958.883673597)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.197755555)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.422785893)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.55615796)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.897687992)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.5159005)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.866540704)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.006056024)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.511496436)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.27701366)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706957.65962196)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.22304688)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.760654412)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.624880794)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 
9706959.67607496)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.337902091)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.689177232)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.619400395)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.640248217)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.689775456)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.895299762)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.140954588)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.961921368)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.447211832)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9706960.17586264)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706958.958266724)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706959.205803515)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.493160894)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9706961.975405212)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707011.896137992)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.171805464)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.21040519)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.48917723)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.606304312)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.729020454)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.009015003)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.566066192)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.351488996)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706961.367075708)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.685592448)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.830895804)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707011.946912743)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9706960.541973488)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.329214538)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.527127981)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707011.963500287)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707013.137438115)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.883915376)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.56732526)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.082582053)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.297758264)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707011.86388307)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707012.142070236)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707011.840448232)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707012.494120672)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707015.0131243)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707013.913426297)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707013.804967696)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.955209276)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707013.723037789)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707013.714598317)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.154930271)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.200178895)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.367389085)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.897337627)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.549903441)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.208176317)])']\n", "connector: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "\r", "Evaluating workflow: 96%|█████████▋| 482/500 [05:04<00:02, 7.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 97%|█████████▋| 484/500 [05:05<00:02, 6.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 97%|█████████▋| 486/500 [05:05<00:02, 4.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.05, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 488/500 [05:05<00:01, 6.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 98%|█████████▊| 492/500 [05:06<00:00, 9.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 99%|█████████▉| 497/500 [05:06<00:00, 12.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.10526315789473684, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 
0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 499/500 [05:06<00:00, 13.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|██████████| 500/500 [05:07<00:00, 1.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 11:58:27.150\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mEvaluation metrics (before optimization): {'f1': 0.6132507992007615, 'em': 0.438, 'acc': 0.656}\u001b[0m\n", "\u001b[32m2025-12-07 11:58:27.150\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m6\u001b[0m - \u001b[1mOptimizing workflow...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", " 0%| | 0/20 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 12%|█▏ | 6/50 [00:01<00:08, 5.43it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 20%|██ | 10/50 [00:01<00:04, 9.06it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.08333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 28%|██▊ | 14/50 [00:01<00:02, 13.25it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 
1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 34%|███▍ | 17/50 [00:02<00:02, 11.90it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.47058823529411764, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 38%|███▊ | 19/50 [00:02<00:02, 10.48it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 42%|████▏ | 21/50 [00:03<00:04, 5.91it/s]\u001b[A\n", "Evaluating workflow: 46%|████▌ | 23/50 [00:03<00:03, 7.00it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 60%|██████ | 30/50 [00:03<00:01, 13.54it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 11:59:24.428\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - 
\u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had 
differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" 
identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 
2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n", "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 66%|██████▌ | 33/50 [00:04<00:01, 9.67it/s]\u001b[A\n", "Evaluating workflow: 72%|███████▏ | 36/50 [00:04<00:01, 10.76it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.046511627906976744, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:01, 9.81it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:04<00:00, 11.62it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 
86%|████████▌ | 43/50 [00:05<00:00, 10.94it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:05<00:00, 10.14it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:05<00:00, 9.16it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 98%|█████████▊| 49/50 [00:05<00:00, 9.33it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|██████████| 50/50 [00:11<00:00, 4.42it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 11:59:32.026\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 1 metrics: {'f1': 0.6432127596637934, 'em': 0.4375, 'acc': 0.75}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 5%|▌ | 1/20 [01:04<20:32, 64.87s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 11:59:32.027\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting 
workflow...\u001b[0m\n", "\u001b[32m2025-12-07 11:59:42.538\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:00:26.760\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:00:32.233\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:00:32.234\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 2 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 8%|▊ | 4/50 [00:01<00:14, 3.13it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 14%|█▍ | 7/50 [00:01<00:07, 5.84it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 20%|██ | 10/50 [00:01<00:04, 8.85it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 
'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 28%|██▊ | 14/50 [00:02<00:02, 12.71it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 34%|███▍ | 17/50 [00:02<00:02, 14.81it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 40%|████ | 20/50 [00:02<00:02, 10.91it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 44%|████▍ | 22/50 [00:03<00:03, 7.88it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 48%|████▊ | 24/50 [00:03<00:03, 8.06it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 52%|█████▏ | 26/50 [00:03<00:02, 8.11it/s]\u001b[A\n", "Evaluating workflow: 60%|██████ | 30/50 [00:03<00:01, 12.14it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 64%|██████▍ | 32/50 [00:04<00:01, 9.41it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:00:36.599\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish 
politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and 
bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan 
Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 68%|██████▊ | 34/50 [00:04<00:01, 8.91it/s]\u001b[A\n", "Evaluating workflow: 72%|███████▏ | 36/50 [00:04<00:01, 9.54it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.0392156862745098, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:01, 10.98it/s]\u001b[A\n", "Evaluating 
workflow: 80%|████████ | 40/50 [00:04<00:00, 11.87it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 86%|████████▌ | 43/50 [00:04<00:00, 15.26it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:05<00:00, 11.23it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:05<00:00, 8.90it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:00:37.961\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 2 metrics: {'f1': 0.7166464418583675, 'em': 0.5416666666666666, 'acc': 0.7708333333333334}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 10%|█ | 2/20 [02:10<19:38, 65.50s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:00:37.963\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:00:48.602\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:01:35.948\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:01:40.905\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:01:40.906\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 3 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 20%|██ | 10/50 [00:02<00:04, 8.12it/s]\u001b[A\n", "Evaluating workflow: 26%|██▌ | 13/50 [00:02<00:03, 10.80it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.08333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 34%|███▍ | 17/50 
[00:02<00:02, 13.72it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 40%|████ | 20/50 [00:02<00:03, 9.22it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 44%|████▍ | 22/50 [00:03<00:03, 7.56it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 50%|█████ | 25/50 [00:03<00:02, 9.81it/s]\u001b[A\n", "Evaluating workflow: 54%|█████▍ | 27/50 [00:03<00:02, 10.34it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 62%|██████▏ | 31/50 [00:03<00:01, 12.42it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:01:44.934\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating 
example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present 
\"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch 
lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities 
presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 66%|██████▌ | 33/50 [00:03<00:01, 13.31it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.05405405405405406, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 70%|███████ | 35/50 [00:04<00:01, 12.27it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 74%|███████▍ | 37/50 [00:04<00:01, 7.81it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 78%|███████▊ | 39/50 [00:04<00:01, 8.14it/s]\u001b[A\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:05<00:01, 8.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[A\n", "Evaluating workflow: 90%|█████████ | 
45/50 [00:05<00:00, 12.52it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:05<00:00, 8.69it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 7.96it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.23529411764705882, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:01:47.293\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 3 metrics: {'f1': 0.6768156961053071, 'em': 0.4583333333333333, 'acc': 0.75}\u001b[0m\n", "\u001b[32m2025-12-07 12:01:47.294\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7166464418583675, 'em': 0.5416666666666666, 'acc': 0.7708333333333334}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 15%|█▌ | 3/20 [03:20<19:03, 67.25s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:01:47.298\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:01:58.088\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:02:39.095\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:02:41.389\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:02:41.390\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 4 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 24%|██▍ | 12/50 [00:01<00:03, 12.33it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 34%|███▍ | 17/50 [00:02<00:02, 
15.62it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.08333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 38%|███▊ | 19/50 [00:02<00:02, 10.70it/s]\u001b[A\n", "Evaluating workflow: 42%|████▏ | 21/50 [00:02<00:02, 11.32it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 46%|████▌ | 23/50 [00:03<00:02, 9.22it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 50%|█████ | 25/50 [00:03<00:02, 8.71it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 54%|█████▍ | 27/50 [00:03<00:02, 8.64it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 56%|█████▌ | 28/50 [00:03<00:03, 6.16it/s]\u001b[A\n", "Evaluating workflow: 62%|██████▏ | 31/50 [00:04<00:02, 8.97it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics 
{'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:02:45.617\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a 
pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was 
originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term 
celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 68%|██████▊ | 34/50 [00:04<00:01, 11.50it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 72%|███████▏ | 36/50 [00:04<00:01, 11.82it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:04<00:00, 11.27it/s]\u001b[A\n", "Evaluating workflow: 86%|████████▌ | 43/50 [00:04<00:00, 11.64it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 
1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:05<00:00, 7.05it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:06<00:00, 6.10it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:06<00:00, 5.57it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.16, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:08<00:00, 6.12it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:02:49.669\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 4 metrics: {'f1': 0.6705847162097162, 'em': 0.4791666666666667, 'acc': 0.7083333333333334}\u001b[0m\n", "\u001b[32m2025-12-07 12:02:49.669\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7166464418583675, 'em': 0.5416666666666666, 'acc': 0.7708333333333334}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 20%|██ | 4/20 [04:22<17:25, 65.33s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:02:49.673\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:02:59.347\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:03:43.161\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:03:46.063\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:03:46.063\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 5 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 80%|████████ | 40/50 [00:04<00:00, 10.97it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3157894736842105, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 84%|████████▍ | 42/50 [00:05<00:00, 9.21it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics 
{'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:05<00:00, 9.17it/s]\u001b[A\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:05<00:00, 11.01it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 98%|█████████▊| 49/50 [00:06<00:00, 6.22it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:10<00:00, 4.76it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:03:56.665\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 5 metrics: {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 25%|██▌ | 5/20 [05:29<16:28, 65.93s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:03:56.666\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:04:06.251\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:04:55.794\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:04:58.678\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:04:58.679\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 6 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 78%|███████▊ | 39/50 [00:05<00:01, 10.31it/s]\u001b[A\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:05<00:01, 8.14it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:05<00:00, 9.00it/s]\u001b[A\n", "Evaluating workflow: 92%|█████████▏| 46/50 [00:05<00:00, 10.14it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating 
workflow: 96%|█████████▌| 48/50 [00:05<00:00, 11.31it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.23529411764705882, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:08<00:00, 5.61it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:05:07.689\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 6 metrics: {'f1': 0.7125217291217563, 'em': 0.5102040816326531, 'acc': 0.7551020408163265}\u001b[0m\n", "\u001b[32m2025-12-07 12:05:07.689\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 30%|███ | 6/20 [06:40<15:47, 67.66s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:05:07.692\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:05:18.394\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:06:05.322\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:06:08.685\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:06:08.686\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 7 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 8%|▊ | 4/50 [00:01<00:15, 3.05it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating 
workflow: 16%|█▌ | 8/50 [00:01<00:06, 6.86it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 22%|██▏ | 11/50 [00:01<00:04, 8.87it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 28%|██▊ | 14/50 [00:02<00:04, 8.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.08333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 32%|███▏ | 16/50 [00:02<00:03, 9.41it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 36%|███▌ | 18/50 [00:02<00:04, 7.82it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 40%|████ | 20/50 [00:03<00:04, 7.09it/s]\u001b[A\n", "Evaluating workflow: 44%|████▍ | 22/50 [00:03<00:03, 8.03it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 
'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 48%|████▊ | 24/50 [00:03<00:03, 8.00it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 52%|█████▏ | 26/50 [00:03<00:02, 8.98it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 56%|█████▌ | 28/50 [00:04<00:02, 8.95it/s]\u001b[A\n", "Evaluating workflow: 60%|██████ | 30/50 [00:04<00:02, 9.51it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:06:13.102\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle 
class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until 
her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York 
Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found 
.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 64%|██████▍ | 32/50 [00:04<00:01, 10.39it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 68%|██████▊ | 34/50 [00:04<00:01, 8.93it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 70%|███████ | 35/50 [00:04<00:02, 7.28it/s]\u001b[A\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 36/50 [00:05<00:02, 6.68it/s]\u001b[A\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:05<00:01, 8.25it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.1818181818181818, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:05<00:00, 10.04it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:05<00:00, 9.89it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 
0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:05<00:00, 12.35it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 7.35it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:06:15.590\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 7 metrics: {'f1': 0.6700637557536226, 'em': 0.4791666666666667, 'acc': 0.7708333333333334}\u001b[0m\n", "\u001b[32m2025-12-07 12:06:15.590\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 35%|███▌ | 7/20 [07:48<14:40, 67.74s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:06:15.594\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:06:28.148\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:07:09.078\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:07:11.849\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:07:11.850\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 8 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 18%|█▊ | 9/50 [00:01<00:04, 10.07it/s]\u001b[A\n", "Evaluating workflow: 24%|██▍ | 12/50 [00:01<00:03, 12.35it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 30%|███ | 15/50 
[00:02<00:02, 14.79it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.08333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 36%|███▌ | 18/50 [00:02<00:03, 9.05it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 40%|████ | 20/50 [00:03<00:03, 7.64it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 44%|████▍ | 22/50 [00:03<00:03, 7.79it/s]\u001b[A\n", "Evaluating workflow: 50%|█████ | 25/50 [00:03<00:02, 10.49it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 54%|█████▍ | 27/50 [00:03<00:02, 10.50it/s]\u001b[A\n", "Evaluating workflow: 60%|██████ | 30/50 [00:03<00:01, 12.79it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 
'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:07:15.935\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, 
founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 
in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target 
demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 66%|██████▌ | 33/50 [00:03<00:01, 11.59it/s]\u001b[A\n", "Evaluating workflow: 70%|███████ | 35/50 [00:04<00:01, 12.20it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.047619047619047616, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:01, 8.35it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 80%|████████ | 40/50 [00:05<00:01, 7.78it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] 
}, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 84%|████████▍ | 42/50 [00:05<00:00, 8.80it/s]\u001b[A\n", "Evaluating workflow: 92%|█████████▏| 46/50 [00:05<00:00, 13.02it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.3157894736842105, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:05<00:00, 13.07it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:05<00:00, 8.59it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:07:17.770\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 8 metrics: {'f1': 0.6829738719532124, 'em': 0.4791666666666667, 'acc': 0.75}\u001b[0m\n", "\u001b[32m2025-12-07 12:07:17.771\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 40%|████ | 8/20 [08:50<13:11, 65.97s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:07:17.774\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:07:28.493\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:08:10.450\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:08:13.542\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:08:13.543\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 9 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.651585676)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707015.965959685)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.060470153)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.091789758)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.424620032)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.43295832)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707017.931121184)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.057930635)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.09579977)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707017.946148412)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.362419236)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.365146125)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707020.809342816)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.01410759)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707017.944564477)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.201111076)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.624524193)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.209854787)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707020.326296186)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.02197791)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.656508552)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.02956096)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.169151496)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.710586151)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.546512447)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.736618614)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.881183779)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.321978008)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.75477134)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707075.122248787)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.133859009)])']\n", "connector: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.486066028)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707077.178895252)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.833429825)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.094679408)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.370659487)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.718743848)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.678543812)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.884836895)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.814233469)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707077.996855205)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.330013072)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.55318922)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.27177372)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.734375795)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.564633254)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.801110856)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.352527631)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.693444692)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.000979144)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.711512964)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.821083685)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.8979512)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.471957617)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.578076333)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.833296085)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707082.284664808)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707082.360441271)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.28095923)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707132.400030317)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.109370796)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707134.317599377)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707134.27891294)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707134.000183556)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.108207133)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.416479038)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707134.498537134)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.11440942)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.382399604)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.105847092)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.981138304)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707137.147356326)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707137.364393743)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707138.113489505)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.773863537)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707134.662620796)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.920408923)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.879241943)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707134.192377906)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.820180157)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.82915832)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707137.208241628)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.585364196)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.110650038)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707137.286773594)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.529320944)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.161872149)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707014.107201245)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707014.5073159)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707015.974166002)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.645190803)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.566470508)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.359978324)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707015.838045632)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.604387034)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.46097257)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.830847204)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.90678297)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707016.486909095)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.304462776)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.309048984)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.14788998)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707017.849422144)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707017.808494205)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707017.927107904)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.11816962)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.373055236)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707018.455969328)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.2988176)])']\n", "connector: \n", "Unclosed connector\n", "connections: 
['deque([(, 9707019.96784685)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.864559844)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707020.05098664)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707020.212158889)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707019.676274871)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.751595158)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707020.52271389)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.835740872)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.568634998)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707020.8889322)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.4968964)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.155311111)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707022.222433913)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.3432812)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.837377371)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.569126667)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707021.274468336)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707022.123499475)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707022.207269806)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707022.286022265)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707022.105061565)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707072.870593712)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.883223513)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.522478549)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.924595593)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707071.749005826)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707071.775426632)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707072.045846896)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.603105264)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707073.707666013)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707073.42869716)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.534368927)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.273706732)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707075.230427904)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.730267085)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707073.319782026)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.380782004)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707074.138917135)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707077.534251872)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.473897576)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.524331767)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.069238491)])']\n", "connector: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707076.741042092)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.401543347)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.871115696)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707076.34732142)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707076.619224215)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.37198732)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707077.955240363)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.0811032)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.512152757)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.509870827)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.201348115)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.234250585)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707078.993439792)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.554248262)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.443288138)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.900125109)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.139122063)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.43639718)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.140862536)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.09893778)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707080.010495916)])']\n", "connector: 
\n", "Unclosed connector\n", "connections: ['deque([(, 9707080.730581397)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707082.095106823)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.543909077)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.660515772)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.387269309)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707082.44549804)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707082.087155476)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.884144524)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707081.9891482)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707083.0108052)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.133791842)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707131.82532695)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.635214297)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707131.98344116)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707131.748767108)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707132.480459215)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707132.738260452)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707131.818610026)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707132.539194109)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707132.6214648)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707132.025332708)])']\n", "connector: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707134.037857164)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.653496008)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707133.520276347)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707019.738637699)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.818522615)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707079.062895624)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707135.434424467)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707137.227059832)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.691386037)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707137.26144008)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707136.680893755)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.14942414)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707193.256819308)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.5835731)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.016428988)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.37083896)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.037927976)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.271060033)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.930602808)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.437452324)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.364462553)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.397170372)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.193141755)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707194.228179386)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.195679108)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.547663912)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.193706892)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.960775128)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.657179188)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.63385952)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.976921678)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.247869357)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.546777317)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.53738434)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707193.653209087)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707194.676299777)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.015698412)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.148265507)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.058266873)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.864820758)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.15095872)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.196911588)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.332243368)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.889426708)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.261921495)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.250782968)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.298643384)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.396642176)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.424164617)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707195.954519909)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.427986916)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707197.342826452)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.481944064)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707197.425958045)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707197.222986907)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707197.198604846)])']\n", "connector: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.701653508)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.943473687)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.540706996)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707196.840432502)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707202.990139926)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707264.899401132)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.876610484)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.194543168)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.467901168)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.161109613)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.3769585)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.074297652)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707264.911487555)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.027962849)])']\n", "connector: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707265.185578166)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.531422056)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707264.830953239)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.022596436)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.897882454)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707264.94339075)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.26921202)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.372508297)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.334328571)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.20052526)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.621677255)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.11988211)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.28009858)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707265.130449172)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.487198643)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9707266.38480451)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.858040847)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.870630229)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.76708205)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.979871072)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707266.615372915)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.89178754)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.598448697)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.568015603)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.338267827)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.96941382)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707266.942159563)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.876471931)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707268.033572052)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.008441808)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.661286924)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.009745687)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.837650567)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707267.959262662)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.869406667)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.4226996)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.5196899)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.112976111)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.925714755)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.503461655)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707268.830285601)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707333.725018656)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.110650655)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707333.622342035)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707333.576244537)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.361525312)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.284337806)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.480100237)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707333.715170853)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707333.666066067)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.57511604)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.147668868)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707333.357814793)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.002011243)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707333.681078652)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.85889017)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.17789384)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.13335866)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.02968151)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707333.686402123)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707333.731368724)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707334.100913111)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.404150328)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707336.315574711)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.16137676)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.321693173)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.790093267)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.602981444)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.983865572)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: 
['deque([(, 9707335.647610627)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.50109625)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.289999064)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.61568494)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.562110497)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.06570932)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.0484294)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707336.105973983)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.636288296)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707336.84873364)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.183015252)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707336.749145068)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707335.902723515)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707336.632287243)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707335.822114432)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707338.11653461)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.217429943)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707338.257676808)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.076594085)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.143205607)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707336.950080456)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707337.747388775)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.436019776)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.335248688)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707393.663095184)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.14300396)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.004668975)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707394.462268027)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.22340624)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.399159122)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.376445897)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707393.883772744)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.247685485)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707393.806834428)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.14883287)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707394.330518913)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707393.957340987)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.413933922)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.015949847)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.002294077)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.611248048)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707394.966075685)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707394.169125725)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.746970244)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.659211311)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.34195212)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707397.416812787)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707397.410509296)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.914305598)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.541785393)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.6373536)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.483153548)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.38145758)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707395.983655902)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.498418001)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.586172232)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.418488432)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.463464784)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707397.2590903)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.85468491)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.815864677)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.80644156)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.845664788)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.793140313)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707396.642312566)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707398.786174994)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707397.405947596)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707398.05663903)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707400.630821783)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: 
['deque([(, 9707398.78082062)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707398.272778125)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707398.516078016)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.411703443)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.180306492)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.56868482)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.921031104)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.768950747)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.39492032)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.202751707)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.743721347)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.500716932)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.450233554)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.731980724)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.7641067)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.700052813)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.2200793)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707458.434610788)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.161608096)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.226691924)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707467.626765637)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.808665546)])']\n", "connector: \n", "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 72%|███████▏ | 36/50 [00:04<00:01, 10.01it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:01, 9.64it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 80%|████████ | 40/50 [00:05<00:01, 9.52it/s]\u001b[A\n", "Evaluating workflow: 84%|████████▍ | 42/50 [00:05<00:00, 10.90it/s]\u001b[A" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:05<00:00, 11.19it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 98%|█████████▊| 49/50 [00:05<00:00, 11.54it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 7.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:08:21.057\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 9 metrics: {'f1': 0.6854523559041515, 'em': 0.4897959183673469, 'acc': 0.7551020408163265}\u001b[0m\n", "\u001b[32m2025-12-07 12:08:21.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 45%|████▌ | 9/20 [09:53<11:56, 65.13s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:08:21.061\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:08:32.431\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:09:18.903\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:09:22.721\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:09:22.722\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 10 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 64%|██████▍ | 32/50 [00:04<00:01, 11.22it/s]\u001b[A\n", "Evaluating workflow: 70%|███████ | 35/50 [00:04<00:01, 13.05it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 
0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 74%|███████▍ | 37/50 [00:04<00:00, 13.36it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 78%|███████▊ | 39/50 [00:04<00:01, 10.86it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:04<00:00, 9.53it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 86%|████████▌ | 43/50 [00:05<00:00, 8.52it/s]\u001b[A\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:05<00:00, 9.70it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:05<00:00, 6.61it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 98%|█████████▊| 49/50 [00:06<00:00, 6.56it/s]\u001b[A\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 7.95it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics 
{'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:09:29.113\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 10 metrics: {'f1': 0.6907472401053704, 'em': 0.5306122448979592, 'acc': 0.7959183673469388}\u001b[0m\n", "\u001b[32m2025-12-07 12:09:29.113\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}. Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 50%|█████ | 10/20 [11:01<11:00, 66.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:09:29.117\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:09:40.046\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:10:23.104\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:10:26.541\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:10:26.542\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 11 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:05<00:00, 11.51it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 92%|█████████▏| 46/50 [00:05<00:00, 10.80it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.048780487804878044, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:05<00:00, 10.18it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 8.11it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:10:32.805\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 11 metrics: {'f1': 0.6983754133258782, 'em': 0.5102040816326531, 'acc': 0.7346938775510204}\u001b[0m\n", "\u001b[32m2025-12-07 12:10:32.806\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7010563245759154, 'em': 0.5510204081632653, 'acc': 0.8163265306122449}. Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 55%|█████▌ | 11/20 [12:05<09:47, 65.32s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:10:32.809\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:10:45.634\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:11:33.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:11:36.290\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:11:36.291\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 12 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:00, 14.08it/s]\u001b[A" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 80%|████████ | 40/50 [00:04<00:00, 14.74it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 84%|████████▍ | 42/50 [00:04<00:00, 15.36it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.052631578947368425, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:04<00:00, 11.83it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 92%|█████████▏| 46/50 [00:04<00:00, 10.34it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:04<00:00, 10.76it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 8.02it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:11:42.629\u001b[0m | \u001b[1mINFO 
\u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 12 metrics: {'f1': 0.7466663334278801, 'em': 0.6122448979591837, 'acc': 0.7346938775510204}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 60%|██████ | 12/20 [13:15<08:53, 66.69s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:11:42.631\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:11:54.052\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:12:43.334\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:12:45.686\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:12:45.687\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 13 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 14%|█▍ | 7/50 [00:01<00:05, 7.30it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 
'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 24%|██▍ | 12/50 [00:01<00:02, 13.70it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 32%|███▏ | 16/50 [00:01<00:02, 14.17it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 38%|███▊ | 19/50 [00:02<00:02, 14.56it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 42%|████▏ | 21/50 [00:02<00:02, 10.96it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 48%|████▊ | 24/50 [00:02<00:02, 12.90it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 52%|█████▏ | 26/50 [00:02<00:02, 10.15it/s]\u001b[A\n", "Evaluating workflow: 58%|█████▊ | 29/50 
[00:03<00:01, 12.67it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 64%|██████▍ | 32/50 [00:03<00:01, 15.45it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 70%|███████ | 35/50 [00:03<00:01, 8.77it/s]\u001b[A\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:01, 10.55it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 80%|████████ | 40/50 [00:04<00:00, 10.65it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 84%|████████▍ | 42/50 [00:04<00:00, 10.90it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:12:50.186\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | 
\u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider 
Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme 
lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be 
a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:04<00:00, 16.14it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:04<00:00, 10.26it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:12:50.664\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 13 metrics: {'f1': 0.6893247955747955, 'em': 0.5416666666666666, 'acc': 0.7291666666666666}\u001b[0m\n", "\u001b[32m2025-12-07 12:12:50.665\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 
0.7466663334278801, 'em': 0.6122448979591837, 'acc': 0.7346938775510204}. Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 65%|██████▌ | 13/20 [14:23<07:49, 67.10s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:12:50.668\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:13:01.360\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|██████████| 50/50 [00:12<00:00, 4.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:15:14.669\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 15 metrics: {'f1': 0.6746326624061587, 'em': 0.5306122448979592, 'acc': 0.7959183673469388}\u001b[0m\n", "\u001b[32m2025-12-07 12:15:14.669\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7466663334278801, 'em': 0.6122448979591837, 'acc': 0.7346938775510204}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 75%|███████▌ | 15/20 [16:47<05:51, 70.37s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:15:14.672\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:15:32.867\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:16:18.710\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:16:20.991\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:16:20.992\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 16 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:00, 12.51it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 80%|████████ | 40/50 
[00:04<00:00, 13.19it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 86%|████████▌ | 43/50 [00:04<00:00, 15.33it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:04<00:00, 14.55it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:04<00:00, 10.89it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 8.09it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:16:27.258\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 16 metrics: {'f1': 0.7765468091998704, 'em': 0.6530612244897959, 'acc': 0.7142857142857143}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 80%|████████ | 16/20 [18:00<04:44, 71.04s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:16:27.259\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:16:35.180\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:17:07.818\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:17:10.682\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:17:10.683\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 17 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 72%|███████▏ | 36/50 [00:12<00:01, 8.17it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 76%|███████▌ | 38/50 [00:12<00:01, 8.58it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 
0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:12<00:00, 10.72it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:13<00:00, 10.20it/s]\u001b[A\n", "Evaluating workflow: 92%|█████████▏| 46/50 [00:13<00:00, 10.62it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:13<00:00, 9.07it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:16<00:00, 3.06it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:17:27.097\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 17 metrics: {'f1': 0.7221250404923873, 'em': 0.5918367346938775, 'acc': 0.6938775510204082}\u001b[0m\n", "\u001b[32m2025-12-07 12:17:27.097\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 
0.7765468091998704, 'em': 0.6530612244897959, 'acc': 0.7142857142857143}. Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 85%|████████▌ | 17/20 [18:59<03:23, 67.67s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:17:27.100\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:17:37.912\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:18:23.503\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:18:25.889\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:18:25.890\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 18 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00\n", "Unclosed connector\n", "connections: ['deque([(, 9707458.897005085)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.646757677)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.720096257)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707462.55132635)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707463.43447793)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.502930064)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.604946433)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.739610877)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.447829146)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.23346539)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.402188633)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.795474013)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.611386908)])']\n", "connector: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9707531.548466388)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.104619948)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.453896586)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.089695835)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.720550245)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.208302524)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.5283118)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.662841609)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.153606985)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.179623809)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.9603424)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.58561882)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.693192083)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707601.436948229)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.278930396)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.52494194)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.71597054)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.520268807)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.364797892)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.419368584)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707602.362500403)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.726600124)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707602.176968245)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.335657844)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.477245819)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.982319444)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.282617139)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.734538365)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707602.062142188)])']\n", "connector: \n", "Unclosed connector\n", "connections: 
['deque([(, 9707602.619244672)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707602.244257051)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.549909707)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707604.872714736)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707604.099599978)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707606.552430283)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.808411364)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.840208163)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707665.633202055)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.630380519)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707664.561348416)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.934223657)])']\n", "connector: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707664.584428605)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.658107035)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.197609393)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.785525896)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.651333084)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707665.422460513)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.59681153)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.89920354)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.653596155)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707665.041417103)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.579668475)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.42502676)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707667.061773388)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.525267556)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.638713796)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707668.385622084)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.124271628)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.150030686)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.152050786)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.449807359)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.160829764)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.02942347)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707726.587130528)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707726.826177992)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.912380636)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.173179576)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.493380733)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: 
\n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.631246272)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.393864702)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.75080848)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707531.577412913)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707664.779357739)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707726.946569698)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.1471173)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.00749514)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707726.768570852)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.517052058)])']\n", "connector: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707728.0879596)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.976643054)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.77601377)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707459.769393684)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.334043223)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.746503174)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.150625087)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.515692104)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.626903951)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.745349672)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707460.982019087)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.257701429)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.86714638)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.394774824)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.495308764)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.94863626)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.00744564)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.27424665)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.78619108)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.587691844)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707461.492807744)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707461.771099128)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.302144432)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.56294125)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.318999251)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707462.53948336)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707462.464061152)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707532.684676355)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.581585955)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.2029318)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.600062069)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.623306494)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.488021959)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.02797728)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.791566549)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.012551034)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.410396608)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707538.651495878)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.555693036)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707533.764820227)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.655101147)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.158529622)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707535.1628474)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.570335466)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.738266915)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.712119812)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.43984105)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.148815213)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707534.380839488)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.187208168)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.438255284)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707535.56899936)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707601.863138458)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707602.99493914)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.38821603)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707604.653806228)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707602.730381448)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.543940304)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.680628609)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.426226487)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.769671464)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.919616144)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707604.070648592)])']\n", "connector: \n", "Unclosed connector\n", "connections: 
['deque([(, 9707604.36708548)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.2119595)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.948571471)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707603.136090633)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707604.39783933)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.034809146)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707604.853722507)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.286871959)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.522600338)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707604.996427532)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.787175342)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.077575771)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.7364713)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.552048847)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.527497975)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707605.581735525)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707665.921120498)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707665.529409217)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.16044383)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707667.620033951)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.036195794)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707666.215717595)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.457390314)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.616541544)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.421464363)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.26733128)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.17136704)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.552201692)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707667.041121012)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707667.045116236)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.904175133)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.120585695)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.18209155)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707667.931501783)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.07367603)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707667.842244271)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.123078441)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.194095936)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.040440936)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.332957951)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707668.732853066)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707602.97016984)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707666.969924016)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707726.78611606)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.45107024)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707727.699706657)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.141366247)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.803578584)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.046580551)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.389489733)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.133802276)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.614525376)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.572982544)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707728.92337196)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.06462101)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.33086546)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.0878265)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.238629123)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.658248208)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.18317472)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.444045836)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.917509828)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.487318384)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.723157132)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.692347651)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.57036629)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707729.974706532)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.22581384)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707731.197319051)])']\n", "connector: \n", "Unclosed 
client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.815169916)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707730.272498945)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.81944113)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.777257051)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.609822348)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.857321188)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707730.860581608)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707732.021616085)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.535454012)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.937854772)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707796.296993144)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9707796.82157096)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.287786225)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.793714892)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.659059625)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.309183508)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.81109254)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.423605507)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.576114211)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707796.3378657)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.461231774)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.631916951)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.73999006)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.733616471)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707796.544687325)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707796.720914204)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707799.631357767)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.155641582)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.085487356)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.516860278)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.554965718)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.612416472)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707796.540395616)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707795.376668507)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.434766104)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.960057102)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.497481672)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.56829488)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.289670464)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.510467233)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.491597056)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.822984515)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.797106316)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.633148413)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.654523)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.373055488)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.605147855)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.94757814)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707797.87021269)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.295551607)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.77076968)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707799.941217208)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707799.088614304)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707799.718631424)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707798.955105396)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9707799.115264056)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707800.075668866)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707799.084336996)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.376531629)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.421908516)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.090272537)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.650488265)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.583445877)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707858.8523968)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.719656665)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.772269515)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.754098428)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.403476452)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.034303283)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.254857741)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.949201502)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.557635542)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.940891627)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.267551696)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.562067516)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.441199161)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.72376001)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707859.205641285)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.71259953)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.702871356)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.564859692)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.73104464)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.023744399)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9707861.359665604)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.502408516)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707863.1880529)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.647637608)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.633817516)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.738840844)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.65702478)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707861.642312689)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.433230052)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.405229803)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707863.189368572)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.072352814)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.422926916)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.61963289)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.140393931)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.77313223)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.816749437)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707860.889524393)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.736873304)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.971278844)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707863.50083556)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707862.392842261)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707863.417089324)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707863.767822685)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707863.31384476)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.979205787)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.653886493)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.181862412)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.381065669)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.814471968)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.315052275)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.950529728)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.427473504)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.517180275)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.73260808)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.757994728)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.33668903)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.181562733)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.45411886)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.275326205)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.608860463)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.544973953)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.539415797)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707928.845055262)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.604401212)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.202937253)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.54933412)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.265908897)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.116031907)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.608184302)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.416470945)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.385346964)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.970517924)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.107395556)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9707930.371138837)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.73152248)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.796479654)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707929.711240744)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.187460065)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.205709271)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707932.184426527)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707932.314872593)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.381142486)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.97917974)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.092199827)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.495111736)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.19258246)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.522339232)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.8892392)])']\n", 
"connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707930.778718416)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707931.835063616)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707932.148463167)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707932.052779896)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707932.545583785)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707933.591562428)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707997.994008511)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.38372397)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.461732056)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707997.881854028)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.834407913)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.54658932)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.288602762)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.390590906)])']\n", "connector: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9707998.298183909)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.331473516)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.90510084)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.550174085)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.432401184)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.920941362)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.387713905)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.20216752)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.424032776)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9707997.9857572)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.211928904)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.491813565)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9707998.72933182)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.167461092)])']\n", "connector: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.262583612)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.743093135)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.40867932)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.311879376)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.885597974)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.893846922)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.968330065)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.779956011)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.78655639)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9707999.862453844)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.29767445)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.154684462)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.78753156)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.667322263)])']\n", 
"connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.226111129)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.139761072)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.99182574)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.591610132)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.888475684)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.2861209)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.631340599)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.680662896)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.279969772)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708001.593978835)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.145380707)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708000.969406504)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.186713569)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708001.627180494)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.073973538)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.262389325)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.261271484)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.55101285)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.280350745)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.077377724)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.112107022)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.749191076)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.47909932)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.38407508)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.297749775)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.135461813)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708060.834696565)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.072995817)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.860071916)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.220770877)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.218117883)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.690582514)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.98254193)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708061.107486652)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708060.949165208)])']\n", "connector: \n", "\n", "Evaluating workflow: 2%|▏ | 1/50 [00:01<01:18, 1.61s/it]\u001b[A\n", "Evaluating workflow: 10%|█ | 5/50 [00:01<00:11, 3.77it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:18:27.806\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abdd0f15542991f6610604d', 'answer': 'a 
failed coup attempt', 'question': 'The Rome Protocols were signed by three Prime Ministers one of which was assassinated as part of what?', 'supporting_facts': [['Rome Protocols', 0], ['Rome Protocols', 1], ['Engelbert Dollfuss', 0], ['Engelbert Dollfuss', 4]], 'context': [['Indian general election, 1996', ['General elections were held in India in 1996 to elect the members of the 11th Lok Sabha contested by the Congress Party and Bharatiya Janata Party.', ' The result of the election was a hung parliament with neither top two leading securing a mandate.', ' The Bharatiya Janata Party formed a short lived government.', ' United Front, consisting of non Congress, non BJP was created and secured support from 332 members out of the 545 seats in the Lok Sabha, resulting in H.D. Deve Gowda from the Janata Dal being the 11th Prime Minister of India.', ' The 11th Lok Sabha produced three Prime Ministers in two years and forced the country back to the polls in 1998.']], ['Seaford (UK Parliament constituency)', ['The UK parliamentary constituency of Seaford was a Cinque Port constituency, similar to a parliamentary borough, in Seaford, East Sussex.', ' A rotten borough, prone by size to undue influence by a patron, it was disenfranchised in the Reform Act of 1832.', ' It was notable for having returned three Prime Ministers as its members – Henry Pelham, who represented the town from 1717 to 1722, William Pitt the Elder from 1747 to 1754 and George Canning in 1827 – though only Canning was Prime Minister while representing Seaford.']], ['List of Japanese prime ministers by longevity', ['This is a list of Japanese prime ministers by longevity.', ' It consists of Prime Ministers and Interim Prime Ministers of Japan who have held the office.', ' If a Prime Minister served more than one non-consecutive term, the dates given are for the beginning of their first term, and the end of their last term.']], ['Rome Protocols', ['The Rome Protocols were a series of three international 
agreements signed in Rome on 17 March 1934 between the governments of Austria, Hungary and Italy.', ' They were signed by Italian Prime Minister Benito Mussolini, Austrian Prime Minister Engelbert Dollfuss and Hungarian Prime Minister Gyula Gömbös.', ' All the three protocols went into effect on 12 July 1934 and were registered in \"League of Nations Treaty Series\" on 12 December 1934.']], ['Yehuda Avner', ['Yehuda Avner (Hebrew: יהודה אבנר\\u200e ; December 30, 1928 – March 24, 2015) was an Israeli prime ministerial advisor, diplomat, and author.', ' He served as Speechwriter and Secretary to Israeli Prime Ministers Golda Meir and Levi Eshkol, and as Advisor to Israeli Prime Ministers Yitzhak Rabin, Menachem Begin, and Shimon Peres.', ' Avner served in diplomatic positions at the Israeli Consulate in New York, and the Israeli Embassy to the US in Washington, DC, and as Israel’s Ambassador to Britain, Ireland and Australia.', ' In 2010, he turned his insider stories about Israeli politics and diplomacy into a bestselling book, \"The Prime Ministers\", which subsequently became the basis for a two-part documentary movie.', ' In 2015, his novel, \"The Ambassador\", which Avner co-authored with thriller writer Matt Rees, was posthumously published.']], [\"Commonwealth Prime Ministers' Conference\", [\"Commonwealth Prime Ministers' Conference were biennial meetings of Prime Ministers of the United Kingdom and the Dominion members of the British Commonwealth of Nations.\", \" Seventeen Commonwealth Prime Ministers' Conferences were held between 1944 and 1969.\", ' As well, the prime ministers met for a Commonwealth Economic Conference in 1952.', ' These series of conferences were a continuation and regularisation of the earlier Imperial Conferences which had been held periodically from 1887 to 1937.', ' Since 1971, Commonwealth Heads of Government Meetings have been held.']], ['Herb Gray', ['Herbert Eser \"Herb\" Gray, {\\'1\\': \", \\'2\\': \", \\'3\\': \", \\'4\\': 
\"} (May 25, 1931 – April 21, 2014) was a Canadian politician and statesman.', ' He served as a Member of Parliament for four decades.', ' He also served as cabinet minister under three prime ministers, and as Deputy Prime Minister from 1997 to 2002.', \" He was Canada's first Jewish federal cabinet minister.\", ' He is one of few Canadians granted the honorific \"The Right Honourable\" who was not so entitled by virtue of a position held.']], ['List of Prime Ministers of Israel by longevity', ['This is a list of Israel Prime Ministers, in order of longevity.', ' This list includes Prime ministers and \"acting\" Prime ministers.', ' There are currently thirteen Prime Ministers on the list and three living Prime Ministers.', ' The list is in descending order and is correct as of none }} .']], ['List of Prime Ministers of Canada by constituency', ['The following list indicates ridings represented by Canadian Prime Ministers during their term(s) of office.', ' Some Prime Ministers represented more than one constituency during their term(s), hence the tallied numbers exceed the number of Prime Ministers.', ' Moreover, one Prime Minister - Sir Mackenzie Bowell - served his term while a member of the Senate, although he had previously been a member of the House of Commons from Ontario.']], ['Engelbert Dollfuss', ['Engelbert Dollfuss (German: \"Engelbert Dollfuß\" , ] ; 4 October 1892 – 25 July 1934) was an Austrian Christian Social and Patriotic Front statesman.', ' Having served as Minister for Forests and Agriculture, he ascended to Federal Chancellor in 1932 in the midst of a crisis for the conservative government.', ' In early 1933, he shut down parliament, banned the Austrian Nazi party and assumed dictatorial powers.', ' Suppressing the Socialist movement in February 1934, he cemented the rule of “austrofascism” through the authoritarian \"First of May Constitution\".', ' Dollfuss was assassinated as part of a failed coup attempt by Nazi agents in 1934.', \" His 
successor Kurt Schuschnigg maintained the regime until Adolf Hitler's annexation of Austria in 1938.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 18%|█▊ | 9/50 [00:01<00:05, 7.49it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 24%|██▍ | 12/50 [00:02<00:04, 7.86it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 30%|███ | 15/50 [00:02<00:04, 7.44it/s]\u001b[A\n", "Evaluating workflow: 34%|███▍ | 17/50 [00:02<00:04, 8.09it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 38%|███▊ | 19/50 [00:02<00:03, 9.16it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 44%|████▍ | 22/50 [00:03<00:02, 10.88it/s]\u001b[A\n", "Evaluating workflow: 48%|████▊ | 24/50 [00:03<00:02, 11.32it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics 
{'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 52%|█████▏ | 26/50 [00:03<00:02, 10.59it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:18:29.648\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist 
Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the 
term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga 
aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 56%|█████▌ | 28/50 [00:03<00:01, 11.04it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 60%|██████ | 30/50 [00:03<00:01, 10.30it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 64%|██████▍ | 32/50 [00:04<00:02, 8.16it/s]\u001b[A\n", "Evaluating workflow: 68%|██████▊ | 34/50 
[00:04<00:01, 9.77it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:04<00:00, 14.18it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 84%|████████▍ | 42/50 [00:04<00:00, 17.75it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:05<00:00, 12.69it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 94%|█████████▍| 47/50 [00:05<00:00, 10.72it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:06<00:00, 7.85it/s]\u001b[A" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:18:32.359\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 18 metrics: {'f1': 0.7604828042328041, 'em': 0.625, 'acc': 0.75}\u001b[0m\n", "\u001b[32m2025-12-07 12:18:32.359\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7765468091998704, 'em': 0.6530612244897959, 'acc': 0.7142857142857143}. Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 90%|█████████ | 18/20 [20:05<02:13, 66.95s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:18:32.363\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:18:43.982\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:19:32.428\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:19:35.197\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:19:35.198\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 19 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 90%|█████████ | 45/50 [00:04<00:00, 17.21it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:05<00:00, 6.84it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:10<00:00, 4.68it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:19:45.970\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 19 metrics: {'f1': 0.7448412698412697, 'em': 0.5918367346938775, 'acc': 0.7142857142857143}\u001b[0m\n", "\u001b[32m2025-12-07 12:19:45.971\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7765468091998704, 'em': 0.6530612244897959, 'acc': 0.7142857142857143}. 
Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\r", " 95%|█████████▌| 19/20 [21:18<01:08, 68.95s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:19:45.974\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m319\u001b[0m - \u001b[1mExecuting workflow...\u001b[0m\n", "\u001b[32m2025-12-07 12:19:57.148\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m347\u001b[0m - \u001b[1mComputing gradients...\u001b[0m\n", "\u001b[32m2025-12-07 12:20:52.167\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m349\u001b[0m - \u001b[1mUpdating agents...\u001b[0m\n", "\u001b[32m2025-12-07 12:20:54.713\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mstep\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mAgents updated\u001b[0m\n", "\u001b[32m2025-12-07 12:20:54.714\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m271\u001b[0m - \u001b[1mEvaluating the workflow at step 20 ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 0%| | 0/50 [00:00.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 16%|█▌ | 8/50 [00:01<00:05, 7.27it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 20%|██ | 10/50 [00:01<00:04, 8.05it/s]\u001b[A" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 24%|██▍ | 12/50 [00:01<00:04, 8.24it/s]\u001b[A\n", "Evaluating workflow: 28%|██▊ | 14/50 [00:02<00:03, 9.95it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 32%|███▏ | 16/50 [00:02<00:03, 9.98it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 38%|███▊ | 19/50 [00:02<00:02, 11.09it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 44%|████▍ | 22/50 [00:02<00:01, 14.22it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 50%|█████ | 25/50 [00:02<00:01, 15.96it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.08333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", 
"Evaluating workflow: 54%|█████▍ | 27/50 [00:02<00:01, 15.10it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 58%|█████▊ | 29/50 [00:03<00:02, 9.64it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:20:58.150\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was a member of the populist 
Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and partners.', ' Today the 
term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , shōjo manga ) is manga 
aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 62%|██████▏ | 31/50 [00:03<00:01, 10.34it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 66%|██████▌ | 33/50 [00:03<00:01, 9.88it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 
70%|███████ | 35/50 [00:03<00:01, 11.52it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 76%|███████▌ | 38/50 [00:03<00:00, 13.09it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 82%|████████▏ | 41/50 [00:04<00:00, 15.51it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 88%|████████▊ | 44/50 [00:04<00:00, 12.03it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 92%|█████████▏| 46/50 [00:04<00:00, 10.92it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 96%|█████████▌| 48/50 [00:04<00:00, 10.85it/s]\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 100%|██████████| 50/50 [00:05<00:00, 8.89it/s]\u001b[A" ] }, { 
"name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:21:00.436\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m275\u001b[0m - \u001b[1mStep 20 metrics: {'f1': 0.7236794053370139, 'em': 0.6041666666666666, 'acc': 0.7291666666666666}\u001b[0m\n", "\u001b[32m2025-12-07 12:21:00.436\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m291\u001b[0m - \u001b[1mMetrics are worse than the best snapshot which has {'f1': 0.7765468091998704, 'em': 0.6530612244897959, 'acc': 0.7142857142857143}. Rolling back to the best snapshot.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "100%|██████████| 20/20 [22:33<00:00, 67.66s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:21:00.440\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m300\u001b[0m - \u001b[1mReached the maximum number of steps 20. 
Optimization has finished.\u001b[0m\n", "\u001b[32m2025-12-07 12:21:00.440\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.workflow.workflow_graph\u001b[0m:\u001b[36msave_module\u001b[0m:\u001b[36m1201\u001b[0m - \u001b[1mSaving SequentialWorkFlowGraph to ./HotPotQASplits_textgrad_final.json\u001b[0m\n", "\u001b[32m2025-12-07 12:21:00.442\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.workflow.workflow_graph\u001b[0m:\u001b[36msave_module\u001b[0m:\u001b[36m1201\u001b[0m - \u001b[1mSaving SequentialWorkFlowGraph to ./HotPotQASplits_textgrad_best.json\u001b[0m\n", "\u001b[32m2025-12-07 12:21:00.443\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.textgrad_optimizer\u001b[0m:\u001b[36mrestore_best_graph\u001b[0m:\u001b[36m448\u001b[0m - \u001b[1mRestored the best graph from snapshot with metrics {'f1': 0.7765468091998704, 'em': 0.6530612244897959, 'acc': 0.7142857142857143}\u001b[0m\n", "\u001b[32m2025-12-07 12:21:00.444\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m10\u001b[0m - \u001b[1mEvaluating workflow on test set...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "Evaluating workflow: 1%|▏ | 7/500 [00:01<01:14, 6.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 3%|▎ | 13/500 [00:01<00:51, 9.45it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 3%|▎ | 15/500 [00:02<00:47, 10.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 3%|▎ | 17/500 [00:06<05:37, 1.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 4%|▍ | 19/500 [00:07<04:30, 1.78it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:07.739\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 16 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 5%|▍ | 24/500 [00:07<02:17, 3.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:21:11.720\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae199305542997b2ef7d20e', 'answer': 'Lacoste, France', 'question': 'In what european city is a location of the college from which the woman known as Comic Book Girl 19 received her degree?', 'supporting_facts': [['Comic Book Girl 19', 0], ['Comic Book Girl 19', 1], ['Savannah College of Art and Design', 0]], 'context': [['Comic book convention', ['A comic book convention or comic con is an event with a primary focus on comic books and comic book culture, in which comic book fans gather to meet creators, experts, and each other.', ' Commonly, comic conventions are multi-day events hosted at convention centers, hotels, or college campuses.', ' They feature a wide variety of activities and panels, with a larger number of attendees participating in cosplay than most other types of fan conventions. 
Comic book conventions are also used as a vehicle for industry, in which publishers, distributors, and retailers represent their comic-related releases.', ' Comic book conventions may be considered derivatives of science-fiction conventions, which began in the late 1930s.']], ['Thomas Sieverts', ['Thomas Sieverts (born 1934) is a German architect and urban planner.', ' He is the author of \"Zwischenstadt\" (1997; first published in English in 2000 as \"Cities without Cities: An interpretation of the Zwischenstadt\"), a book which addresses the decentralization of the compact historical European city and examines the new form of urbanity which has spread across the world describable as the urbanised landscape or the landscaped city.', ' Sieverts calls this the \"Zwischenstadt\", or \"in-between city\", as it exists between old historical city centres and open countrysides, between place as a living space and the non-places of movement, between small local economic cycles and the dependency on the world market.', ' In 2008 a group calling itself \"suddenly\" commissioned the American writer Diana George to make a new translation of \"Zwischenstadt\" which they published as \"Where We Live Now\" (the English phrase George chose as the translation of Sieverts\\'s neologism \"Zwischenstadt\").', ' In October 2008, Sieverts came to Portland, Oregon, on the occasion of the book\\'s publication to take part in a week-long symposium about his work, also called \"suddenly\".']], ['Sheena, Queen of the Jungle', ['Sheena, Queen of the Jungle is a fictional American comic book jungle girl heroine, originally published primarily by Fiction House.', ' She was the first female comic book character with her own title, with her 1937 (in Great Britain; 1938 in the United States) premiere preceding \"Wonder Woman\" #1 (cover-dated Dec. 
1941).', ' Sheena inspired a wealth of similar comic book jungle queens.', ' She was predated in literature by Rima, the Jungle Girl, introduced in the 1904 William Henry Hudson novel \"Green Mansions\".', ' Sheena was ranked 59th in \"Comics Buyer\\'s Guide\"s \"100 Sexiest Women in Comics\" list.']], ['The Architecture of the City', ['The Architecture of the City (Italian: \"L\\'architettura della città\" ) is a seminal book of urban design theory by the Italian architect Aldo Rossi published in Padova in 1966.', ' The book marks the shift from the urban doctrines of modernism to a rediscovery of the traditional European city.']], ['Bratslav', ['Bratslav (Ukrainian: Брацлав ; Polish: \"Bracław\" ; Yiddish: בראָסלעוו\\u200e , \"Broslev\", today also pronounced Breslev or \"Breslov\" as the name of a Hasidic group, which originated from this town) is an urban-type settlement in Ukraine, located in Nemyriv Raion of Vinnytsia Oblast, by the Southern Bug river.', ' It is a medieval European city and a regional center of the Eastern Podolia region (see Bratslav Voivodeship) founded by government of the Crown of the Kingdom of Poland, which dramatically lost its importance during the 19th-20th centuries.', ' Population: \\u2009(2015 est.)']], ['Microcosm: Portrait of a Central European City', ['Microcosm: Portrait of a Central European City is a 2002 book by historians Norman Davies and Roger Moorhouse about the history of Wrocław, the largest city in western Poland.']], ['Metropolis Collectibles', ['Metropolis Collectibles is a famous rare comic book dealer of vintage American comics, primarily known for its large collection of comic books originally published in the 1930s, 1940s, 1950s, 1960s and 1970s.', \" Metropolis was founded in 1984 by Stephen Fishler, and merged companies in 1999 with Vincent Zurzolo, Jr., of Vincent's Collectibles.; Zurzolo said that as he found he could not compete with Fishler's business, merging the two made sense.\", ' The company is 
located on Broadway in New York City, and the comic book showroom allows viewings by appointment only.', ' Over the years, Metropolis Collectibles has grown from being a comic-book mail-order company to maintaining a major online retail presence.', ' In addition to being comic book buyers and comic book sellers, Metropolis also gives comic book appraisals and provides comic book valuation services of rare, old out-of-print comics.', ' Metropolis Collectibles has obtained a variety of notable classic comic book collections over the years, or \"pedigrees\", including the Crowley Collection, the Allentown Pedigree, the D-Copy Collection, and the Northford Collection.', ' In August 2014, the company was able to purchase a near-mint copy of \"Action Comics #1\" (CGC 9.0) for $3.2 million in an auction on eBay.']], ['Savannah College of Art and Design', ['Savannah College of Art and Design (SCAD), is a private, nonprofit, accredited university with locations in Savannah, Georgia; Atlanta, Georgia; Hong Kong; and Lacoste, France.']], ['Parks and open spaces in Birmingham', ['Birmingham has 571 parks totalling over 3500 ha of public open space, more than any other equivalent sized European city.', \" The centrepieces of Birmingham's park system are the five Premier Parks.\", ' Ten parks have received the prestigious Green Flag Award.', ' The city also has five local nature reserves, one national nature reserve and a number of Wildlife Trust nature reserves.']], ['Comic Book Girl 19', ['Danika Lee Massey, also known as Comic Book Girl 19 or CBG19, is a YouTube personality known for her commentaries on comics, films, books, and television shows.', ' She has a degree in sequential art from the Savannah College of Art and Design.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current 
AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 5%|▌ | 26/500 [00:11<06:12, 1.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:21:12.377\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:12.382\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▌ | 28/500 [00:11<04:42, 1.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:12.425\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:12.608\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a716ec85542994082a3e82d', 'answer': \"You're Next\", 'question': 'Which movie was filmed first \"The Guest\" or \"You\\'re Next\"?', 'supporting_facts': [['The Guest (film)', 0], [\"You're Next\", 0]], 'context': [['K. Ravindran Nair', ['K. Ravindran Nair was born in a rich family, dealing with cashew production and exports.', ' His passion for literature and arts brought him into Malayalam cinema and in 1967, he established \"General Pictures\" under the banner of which he produced his first movie, Anweshichu Kandethiyilla, directed by P. 
Bhaskaran.', ' This was followed by two more films the next year, Kattukurangu and Lakshaprabhu, both directed by Bhaskaran.', ' Ravi, as he is generally known, was silent for the next few years till he came out with his next film, Achani, an A. Vincent movie, in 1973, which earned him the moniker, \"Achani Ravi\".', ' The film was reported to be a commercial success like his earlier films and Ravi is known to have contributed the returns from the movie for building a Public Library in Kollam, of which he is a founder member and honorary secretary.']], ['The White Hell of Pitz Palu', ['The White Hell of Pitz Palu (German: \"Die weisse Hölle vom Piz Palü\" ) is a 1929 German silent mountain film co-directed by Arnold Fanck and Georg Wilhelm Pabst and starring Leni Riefenstahl, Gustav Diessl, Ernst Petersen, and World War I flying ace Ernst Udet.', ' Written by Arnold Fanck and Ladislaus Vajda, the film is about a man who loses his wife in an avalanche while climbing the Piz Palü mountain, and spends the next few years searching the mountain alone for her body.', ' Four years later he meets a young couple who agree to accompany him on his next climb.', ' \"The White Hell of Pitz Palu\" was filmed on location in the Bernina Range in Graubünden, Switzerland.', ' The 1929 theatrical release starred Kurt Gerron, who was Jewish, as a night club guest.', ' The film was edited to remove scenes featuring Gerron, and it was rereleased as a 90-minute German-language sound film in 1935.', ' It was remade in 1950.']], ['Asturian cinema', ['The Asturian Cinema (or \"Asturian National Cinema\") in Asturias, Spain, began in 1905 with the production of the first Asturian fiction film known as \"El robo de fruta\" (The fruit robbery).', ' It was filmed by the Asturian film maker Javier Sánchez Manteola.', ' This movie was filmed in Gijón (Asturias) and premiered in that city in the same year.', ' It was shown in the old movie theater known as Salón Luminoso once located at Begoña 
Walk.', ' In commemoration of this even the Asturian Film Festival was established in the year 2005.']], ['The Guest (film)', ['The Guest is a 2014 American action horror-thriller film directed and edited by Adam Wingard and written by Simon Barrett, both of whom previously collaborated on a previous film, \"You\\'re Next\" (2011).', ' Starring Dan Stevens, Maika Monroe, Leland Orser, Sheila Kelley, Brendan Meyer, and Lance Reddick, the plot follows a soldier named \"David\" unexpectedly visiting the Peterson family, introducing himself as a friend of their son who had died during the Afghanistan war.', ' After the man is welcomed into their home for a couple of days, a series of deaths begin to occur around his presence, and their daughter Anna begins to have suspicions of David being connected to the deaths.']], ['Stephen Nicholas (actor)', ['Stephen Nicholas (born 23 August 1978) also known as Stephen Charles Nicholas is an actor and presenter from Doncaster, South Yorkshire, England.', \" Stephen currently lives in Sheffield, his first role was on Sky One's Dream Team, where he played Scott Ward.\", ' From there, he filmed the first in the trilogy Goal!', ' (In which he played a Newcastle United Reserves player).', ' Following this, he moved to Los Angeles, where he played Smith in the feature film Futbaal: The Price of Dreams.', ' Stephen then returned to the UK to make a Bollywood film called Dhana Dhana Goal with John Abraham.', ' Stephen then experienced his first opportunity in reality TV with the show Premier League All Stars for Sky One, as well as playing a footballer, he was on-hand to present celebrity gossip and pitch side reports.', ' He then appeared in Celebrity Most Haunted and Date the Enemy.', ' From there he then went on to star in Goal 3 where he not only acted in the film he also became the football choreographer and choreographed all the football scenes in the film.', ' Nicholas then starred in the film Damned United where he played Welsh 
international Alan Durban, the film was filmed in Chesterfield and Leeds and was directed by Oscar winner Tom Hooper and also starred Oscar nominated Michael Sheen.', \" Stephens next production was the feature film called 'No Way Back Now'about the notorious Manchester district of Moss Side, where Stephen played the lead actor Stuart Gavin,The feature is roughly based on the notorious Gooch gang that terrorised Manchester throughout the years.\", ' The next move for Stephen was pantomime where he was part of the production Aladdin over the Christmas period of 2015 in Doncaster playing Abanaza the main villain which he did until January 7, 2016!', '.', \" He has recently been cast in the up-and-coming Feature Film 'Whiteblade' where he will play Thurstan the head Warlord Whiteblade is currently in production and Stephen is shooting his scenes in August 2016.\", \" In September 2016 Stephen will be presenting the Sky TV show 'Britz go Bollywood' the show consists of a group of Celebrities being dressed by The best Indian designers, Stephen is the main presenter of the show which will be screened live September 2, 2016.\"]], [\"Live from Daryl's House\", [\"Live from Daryl's House (simply known as Daryl's House, and often abbreviated as LFDH) is an online series that was first created in fall 2007.\", ' The show features singer-songwriter Daryl Hall performing with his band and various guest artists at his home in Millerton, New York.', ' The show provides a performance space that is an alternative to live concerts and studio sessions for popular artists.', ' This allows the artists to \"…have fun and [be] creatively spontaneous\".', ' The majority of shows include a segment in which Hall and the guest artist prepare food from different cuisines for everyone to eat.', ' The food comes from various local restaurants and the chefs of those establishments walk Hall and guest through the preparation of the food.', ' \"Live From Daryl\\'s House\" expanded to broadcast TV 
but remained unchanged.', ' Hall was quoted by Billboard.com as saying \"it\\'s an Internet show that is being shown on television, so I\\'m not adapting the show at all in any way to be a \\'TV\\' show.\"', ' The show debuted in 95 markets on September 24, 2011, with back-to-back half-hour episodes featuring Train (Episode 33) and Fitz & the Tantrums (Episode 35).', ' Starting with the 66th episode of \"Live From Daryl\\'s House\", the shows are filmed at Hall\\'s club, Daryl\\'s House, in Pawling, New York.']], [\"You're Next\", [\"You're Next is a 2011 American slasher film directed by Adam Wingard, written by Simon Barrett and starring Sharni Vinson, Nicholas Tucci, Wendy Glenn, A. J. Bowen and Joe Swanberg.\", ' The plot concerns a family under attack by a group of masked assailants during their wedding anniversary getaway.']], ['Rose Marie (1954 film)', ['Rose Marie is a 1954 musical adaptation of the 1924 operetta of the same name, the third to be filmed by Metro-Goldwyn-Mayer, following a 1928 silent movie and the best-known of the three, the 1936 Jeanette MacDonald/Nelson Eddy version.', ' It is directed by Mervyn LeRoy and stars Ann Blyth, Howard Keel and Fernando Lamas.', ' This version is filmed in the Canadian Rockies in CinemaScope.', \" It was MGM's first US produced film in the new widescreen medium (having been preceded by the British made Knights of the Round Table) and the first movie musical of any studio to be released in this format.\"]], ['The Pin Up Girls', ['The Pin Up Girls are a girl group and dance troupe, founded by New York City native Vixen Romeo in 2005, which began as a burlesque-style performance group based in Los Angeles.', \" Performing at Hollywood's most notorious venues such as The Viper Room Key Club and Roxy the girls quickly gained local attention with their girl-on-girl themed, tribal fusion belly dance, burlesque and hip hop routines.\", ' Between 2006-2008 The Pin Up Girls started to become poster girls for the lesbian 
scene with performances for Curve (magazine), a guest appearance on LOGO network\\'s reality series \"Curl Girls\", a web series segment on AfterEllen, a performance for the LGBT community hosted by Jane Lynch, and performances in Margaret Cho\\'s Sensuous Woman Show.', ' In 2008 The Pin Up Girls first recorded single \"There She Goes...She\\'s Real Fly\" was picked up to be played on Showtime\\'s hit lesbian series \"The L Word\".', ' In 2009 The Pin Up Girls music video, \"There She Goes...She\\'s Real Fly\" premiered on Logo (TV channel) (an MTV network), on New Now Next Pop Lab.', ' The Pin Up Girls\\' \"Girl Candy,\" filmed in N.Y. and L.A., was released in 2011.', ' The Pin Up Girls\\' \"Pretty Things\", featuring actress Elaine Hendrix, was filmed in L.A. by Director Joe LaRue in 2012 and was released in June 2012.']], ['Live from Abbey Road', ['Live from Abbey Road is a 12-part, one-hour performance series/documentary that began filming its first season during 2006 at Abbey Road Studios in London.', ' Season 2 was filmed between 2007 and 2008, season 3 was filmed in 2009 and Season 4 was filmed in 2011.', ' The series features a total of 128 musical artists to date (about 32 per Season) -- usually two or three per show, performing up to five songs per session.', ' The sessions are recorded without a live audience.', ' Filmed in High-Definition with the occasional use of 35 mm lenses, the producers have sought to record performances which \"look like a movie and sound like a record\".']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▌ | 30/500 [00:12<03:35, 2.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:21:12.679\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5addc79d5542995b365fab7b', 'answer': 'To SquarePants or Not to SquarePants', 'question': 'Which episode of SpongeBob SquarePants aired first, The Clash of Triton or To SquarePants or Not to SquarePants?\"', 'supporting_facts': [['The Clash of Triton', 0], ['The Clash of Triton', 1], ['SpongeBob SquarePants (season 6)', 5]], 'context': [['The SpongeBob SquarePants Movie', ['The SpongeBob SquarePants Movie is a 2004 American live-action/animated comedy film based on the Nickelodeon television series \"SpongeBob SquarePants\".', \" The film was co-written, directed, and co-produced by series creator Stephen Hillenburg and starred the series' cast of Tom Kenny, Bill Fagerbakke, Clancy Brown, Rodger Bumpass and Mr. Lawrence, with guest performances by Scarlett Johansson, Jeffrey Tambor, Alec Baldwin and David Hasselhoff.\", ' It was produced by Hillenburg\\'s production company United Plankton Pictures and Nickelodeon Movies, it was distributed by Paramount Pictures and was also the first film in the \"SpongeBob SquarePants\" film series.', \" In the film, Plankton devises a plan to steal King Neptune's crown and send it to Shell City, and SpongeBob and Patrick must retrieve the crown to save Mr. 
Krabs from King Neptune's wrath and Bikini Bottom from Plankton's plan.\"]], ['The Clash of Triton', ['\"The Clash of Triton\", also known as \"Neptune\\'s Party\", is the 26th episode of the sixth season and the 126th overall episode of the American animated television series \"SpongeBob SquarePants\".', ' It originally aired on Nickelodeon in the United States on July 5, 2010.']], [\"SpongeBob SquarePants: Plankton's Robotic Revenge\", ['SpongeBob SquarePants: Plankton\\'s Robotic Revenge is an action-adventure video game based on the television series \"SpongeBob SquarePants\".', ' It was released in October 2013 for Wii U, Wii, Nintendo 3DS, Nintendo DS, PlayStation 3, and Xbox 360.', ' The game was developed by Behaviour Interactive and published by Activision, who took over the license from previous \"SpongeBob SquarePants\" video game publisher THQ after the company\\'s bankruptcy and liquidation.']], ['List of SpongeBob SquarePants guest stars', ['In addition to the show\\'s regular cast of voice actors, guest stars have been featured on \"SpongeBob SquarePants\", an American animated television series created by marine biologist and animator Stephen Hillenburg for Nickelodeon.', ' \"SpongeBob SquarePants\" chronicles the adventures and endeavors of the title character and his various friends in the fictional underwater city of Bikini Bottom.', ' Many of the ideas for the show originated in an unpublished, educational comic book titled \"The Intertidal Zone\", which Hillenburg created in the mid-1980s.', ' He began developing \"SpongeBob SquarePants\" into a television series in 1996 upon the cancellation of \"Rocko\\'s Modern Life\", which Hillenburg directed.', ' The pilot episode first aired on Nickelodeon in the United States on May 1, 1999.', ' The show\\'s ninth season premiered in 2012, and episodes of \"SpongeBob SquarePants\" have aired.', ' A feature-length film adaptation of the show, \"The SpongeBob SquarePants Movie\", was released in 2004; in 
2015, a sequel, \"\", was released.']], ['SpongeBob SquarePants: Lights, Camera, Pants!', ['SpongeBob SquarePants: Lights, Camera, Pants!', ' is a 2005 party video game based on the TV series \"SpongeBob SquarePants\".', ' It was released in October 2005 for the Xbox, PlayStation 2, GameCube, Game Boy Advance, and the PC.', ' It was released for the Nintendo DS in Korea in 2007, but its North American release was cancelled.', ' It is the first \"SpongeBob SquarePants\" title to feature multiplayer mini-games, similar to the \"Mario Party\" video game series.', ' It is also the last SpongeBob game for the Xbox.', ' It is also the last time Charles Nelson Reilly would voice the Dirty Bubble before his death in 2007.', ' For reasons unknown, Mermaid Man was not voiced by his original voice actor Ernest Borgnine but instead Joe Alaskey, who would voice him again in .']], ['SpongeBob SquarePants 4D: The Great Jelly Rescue', ['SpongeBob SquarePants 4D: The Great Jelly Rescue (often referred to as SpongeBob SquarePants 4D or simply The Great Jelly Rescue) is a 4-D film attraction that serves a sequel to SpongeBob SquarePants 4-D.', ' It follows SpongeBob, Patrick, and Sandy as they go jellyfishing.']], ['SpongeBob SquarePants (season 6)', ['The sixth season of the American animated television series \"SpongeBob SquarePants\", created by former marine biologist and animator Stephen Hillenburg, aired on Nickelodeon from March 3, 2008 to July 5, 2010, and contained 26 episodes, beginning with the episode \"Krabby Road\".', ' The series chronicles the exploits and adventures of the title character and his various friends in the fictional underwater city of Bikini Bottom.', ' The season was executive produced by series creator Hillenburg and supervising producer Paul Tibbitt, who also acted as the showrunner.', ' In 2009, the show celebrated its tenth anniversary on television.', ' The documentary film titled \"\" premiered on July 17, 2009, and marked the anniversary. 
\"', 'SpongeBob\\'s Truth or Square\", a television film, and the special episode \"To SquarePants or Not to SquarePants\" were broadcast on Nickelodeon, as part of the celebration.']], ['The SpongeBob Movie: Sponge Out of Water', ['The SpongeBob Movie: Sponge Out of Water is a 2015 American 3D live-action/animated comedy film based on the animated television series \"SpongeBob SquarePants\".', ' A stand-alone sequel to \"The SpongeBob SquarePants Movie\" (2004), it was directed by former series showrunner Paul Tibbitt in his directorial debut, with live-action sequences directed by Mike Mitchell.', ' It was the first film to be produced by Paramount Animation and second film in the \"SpongeBob SquarePants\" film series.', \" The film stars Antonio Banderas and features the show's regular voice cast, who returned to reprise their respective roles from the series and the previous film.\", ' The plot follows a pirate called Burger-Beard, who steals the Krabby Patty secret formula using a magical book that makes any text written upon it come true.', ' SpongeBob and his friends must travel to the surface to confront Burger-Beard and get the formula back.']], ['SpongeBob SquarePants 4-D', ['SpongeBob SquarePants 4-D (also known as SpongeBob SquarePants 4-D Ride, SpongeBob SquarePants: The Ride or SpongeBob SquarePants 3-D) is a cel-shaded 4-D film based upon the popular television series \"SpongeBob SquarePants\".', ' It can be found at many aquariums and theme parks across the world.', ' The ride consists of a pre-show which then leads into a stadium seated auditorium.', ' The ride is in 4-D, meaning it is a motion simulator with a 3D movie.', ' The effects on the ride vary at different parks.', ' Water spray, bubbles, wind, leg ticklers, smoke, and smells are usually found.']], ['SpongeBob SquarePants: Original Theme Highlights', ['SpongeBob SquarePants: Original Theme Highlights is the first album of songs played on the Nickelodeon TV series \"SpongeBob 
SquarePants\".', \" It includes tracks sung by the cartoon's characters: SpongeBob SquarePants, Sandy Cheeks, Patrick Star, Squidward Tentacles, and Plankton.\", ' Its total running time is 9 minutes and 9 seconds, spanning seven tracks.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:21:12.688\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ade25ed5542997c77aded70', 'answer': 'the Cold War (1947–91)', 'question': 'During what war were the Russia-United Kingdom relations in a state of rivalry after the abdication of Emperor Nicholas II? 
', 'supporting_facts': [['Russia–United Kingdom relations', 3], ['Russia–United Kingdom relations', 4], ['Russian Revolution', 1]], 'context': [['Grand Duke Vladimir Alexandrovich of Russia', ['Grand Duke Vladimir Alexandrovich of Russia (\"Влади́мир Александрович\") ) (22 April 1847 – 17 February 1909) was a son of Emperor Alexander II of Russia, a brother of Emperor Alexander III of Russia and the senior Grand Duke of the House of Romanov during the reign of his nephew, Emperor Nicholas II.']], ['Grand Duke Sergei Alexandrovich of Russia', ['Grand Duke Sergei Alexandrovich of Russia (\"Сергей Александрович\"; May 11, 1857 – February 17, 1905) was the fifth son and seventh child of Emperor Alexander II of Russia.', \" He was an influential figure during the reigns of his brother Emperor Alexander III of Russia and his nephew Emperor Nicholas II, who was also his brother in law through Sergei's marriage to Elizabeth the sister of Tsarina Alexandra.\"]], ['Charles Sydney Gibbes', ['Charles Sydney Gibbes (19 January 1876 – 24 March 1963) was a British academic who from 1908 to 1917 served as the English tutor to the children of Emperor Nicholas II of Russia.', ' When Nicholas abdicated the throne in March 1917 Gibbes voluntarily accompanied the Imperial family into exile to the Siberian village of Tobolsk.', ' After the family was murdered in 1918 Gibbes returned to the United Kingdom and eventually became an Orthodox monk, adopting the name of \"Nicholas\" in commemoration of Nicholas II.', ' He died in 1963, and is buried at Headington cemetery, Oxford, Oxfordshire, England.']], ['October Manifesto', [\"The October Manifesto (Russian: Октябрьский манифест, Манифест 17 октября ), officially The Manifesto on the Improvement of the State Order (Манифест об усовершенствовании государственного порядка), is a document that served as a precursor to the Russian Empire's first constitution, which would be adopted the next year.\", ' The Manifesto was issued by Emperor 
Nicholas II, under the influence of Sergei Witte, on 30 October\\xa0[O.S. 17 October]\\xa01905 as a response to the Russian Revolution of 1905.', \" Nicholas strenuously resisted these ideas, but gave in after his first choice to head a military dictatorship, Grand Duke Nicholas, threatened to shoot himself in the head if the Tsar did not accept Witte's suggestion.\", ' Nicholas reluctantly agreed, and issued what became known as the October Manifesto, promising basic civil rights and an elected parliament called the Duma, without whose approval no laws were to be enacted in Russia in the future.', ' According to his memoirs Witte did not force the Tsar to sign the October Manifesto, which was proclaimed in all the churches.']], ['Russia–United Kingdom relations', [\"The Russia–United Kingdom relations (Russian: Российско-британские отношения ) is the relationship between the Russian Federation and the United Kingdom of Great Britain and Northern Ireland and it's overseas territories.\", ' Spanning nearly five centuries, it has often switched from a state of alliance to rivalry or even war.', ' The Russians and British were allies against Napoleon, and enemies in the Crimean War of the 1850s, and rivals in the Great Game for control of central Asia in the late 19th century.', ' They were allies again in World Wars I and II, although relations were strained by the Russian Revolution of 1917.', \" They were at sword's point during the Cold War (1947–91).\", ' Russian big businesses had strong connections with the City of London and British corporations during the late 1990s and 2000s.']], ['Ural State Mining University', ['Ural State Mining University (Russian: Уральский государственный горный университет ) is situated in Yekaterinburg, Russian Federation.', ' It was founded in 1914.', ' In 1917 Nicholas II signed an order titled \"On keeping of the Yekaterinburg Institute of Mines under the patronage of His Majesty the Emperor and on giving to this educational 
establishment the title of \"The Emperor Nicholas II Ural Institute of Mines\"\".']], ['Nicholas II of Russia', ['Nicholas II or Nikolai II (Russian: Николай II Алекса́ндрович , \"Nikolay II Aleksandrovich\" ; 18 May [O.S. 6 May] 1868 – 17 July 1918) was the last Emperor of Russia, ruling from 1 November 1894 until his forced abdication on 15 March 1917.', ' His reign saw the fall of the Russian Empire from being one of the foremost great powers of the world to economic and military collapse.', ' Due to the Khodynka Tragedy, anti-Semitic pogroms, Bloody Sunday, the violent suppression of the 1905 Revolution, the execution of political opponents and his perceived responsibility for the Russo-Japanese War, he was given the nickname Nicholas the Bloody by his political adversaries.', ' Soviet historiography portrayed Nicholas as a weak and incompetent leader, whose decisions led to military defeats and the deaths of millions of his subjects.']], ['Prince Andrew Romanov', ['Prince Andrew Andreyevich Romanov (born 21 January 1923) is a Russian American artist and author.', \" He is a grand-nephew of Russia's last Emperor, Nicholas II.\", ' Since December 31, 2016 he is a claimant to the headship of the Imperial House of Russia and President of the Romanov Family Association.', ' He is a great-great-grandson in the male-line of Emperor Nicholas I of Russia.']], ['Russian Revolution', ['The Russian Revolution was a pair of revolutions in Russia in 1917 which dismantled the Tsarist autocracy and led to the rise of the Soviet Union.', ' The Russian Empire collapsed with the abdication of Emperor Nicholas II and the old regime was replaced by a provisional government during the first revolution of February 1917 (March in the Gregorian calendar; the older Julian calendar was in use in Russia at the time).', \" Alongside it arose grassroots community assemblies (called 'soviets') which contended for authority.\", ' In the second revolution that October, the Provisional 
Government was toppled and all power was given to the soviets.']], ['Grand Duke Alexander Alexandrovich of Russia', ['Grand Duke Alexander Alexandrovich of Russia (Russian: Великий Князь Александр Александрович Романов; 7 June 1869 – 2 May 1870) was the infant son of Emperor Alexander III–the heir apparent, styled \"Tsesarevich\", to the Russian throne as the eldest living son of Emperor Alexander II–and his consort, Marie Fyodorovna of Russia.', \" He was Alexander and Marie's second child, second son, and the younger brother of the future Emperor Nicholas II.\", ' He died of meningitis in 1870, one month before his first birthday.', ' \"The doctors maintain he did not suffer, but we suffered terribly to see and hear him,\" his mother wrote to her own mother, Queen Louise of Denmark.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:21:12.697\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abba3cd554299642a094aee', 'answer': 'six', 'question': 'How many different schools does the university, in which Andrew J. Elliot is a professor of psychology, have?', 'supporting_facts': [['Andrew J. 
Elliot', 0], ['University of Rochester', 2]], 'context': [['Archives of Scientific Psychology', ['Archives of Scientific Psychology is an open access academic journal published by the American Psychological Association.', ' The journal publishes a wide variety of articles pertaining to the many different sub-fields of psychology, such as neuroscience and political psychology.', ' The journal includes articles that cover the many different research methodologies employed by psychologists.', ' The current editors-in-chief are Cecil R. Reynolds (Texas A&M University) and Gary R. VandenBos (American Psychological Association).']], ['Neigong', ['Neigong, also spelled \"nei kung\", \"neigung\", or \"nae gong\", refers to any of a set of Chinese breathing, meditation and spiritual practice disciplines associated with Daoism and especially the Chinese martial arts.', ' Neigong practice is normally associated with the so-called \"soft style\", \"internal\" or neijia 內家 Chinese martial arts, as opposed to the category known as waigong 外功 or \"external skill\" which is historically associated with shaolinquan or the so-called \"hard style\", \"external\" or wàijiā 外家 Chinese martial arts.', ' Both have many different schools, disciplines and practices and historically there has been mutual influence between the two and distinguishing precisely between them differs from school to school.']], ['Andrew J. Offutt', ['Andrew Jefferson Offutt (August 16, 1934 – April 30, 2013) was an American science fiction and fantasy author.', ' He wrote as Andrew J. Offutt, A. J. Offutt, and Andy Offutt.', ' His normal byline, andrew j. 
offutt, has all his name in lower-case letters.', ' He also wrote erotica under seventeen different pseudonyms, principally John Cleve, John Denis, Jeff Morehead, and Turk Winter.', ' He is the father of novelist Chris Offutt and professor Jeff Offutt.']], ['Affix grammar over a finite lattice', ['In linguistics, the affix grammars over a finite lattice (AGFL) formalism is a notation for context-free grammars with finite set-valued features, acceptable to linguists of many different schools.']], ['Hojōjutsu', ['Hojōjutsu (捕縄術), or Torinawajutsu (捕縄術), or just Nawajutsu (縄術), is the traditional Japanese martial art of restraining a person using cord or rope (said \"nawa\" 縄 in Japanese).', ' Encompassing many different materials, techniques and methods from many different schools, Hojōjutsu is a quintessentially Japanese art that is a unique product of Japanese history and culture.']], ['Michael Bayne', ['Michael Bayne is an athletic coach who has led teams in many sports, and in schools all across North and South Carolina.', ' He served as the Head Golf Coach and Special Teams Coordinator at Brevard College from 2006 until 2010, where he then worked as the Head Track, Cross Country and Lacrosse Coach and Special Teams Coordinator for North Greenville University.', ' He grew up in South Carolina, receiving his B.A. Degree from University of South Carolina in 1980.', ' Bayne has provided services as the Head and Assistant Coach for many different schools and team sports, from 1984 to the present.', ' In addition to his coaching career, Michael Bayne has been an upstanding educator and administrator.']], ['Jennifer McFalls', ['Jennifer Yvonne McFalls (born November 10, 1971) is a retired professional softball player who played for Texas A&M and then went on to the U.S. 
National Softball Team.', ' After her years playing softball McFalls decided to become a coach with her first position as the assistant coach at Texas A&M.', ' Mcfalls continued to coach for many years with several different schools at many different competitive levels.', ' She was the head coach of the National Pro Fastpitch professional softball team, the Dallas Charge for their inaugural season.']], ['University of Rochester', ['The University of Rochester ( U of R or UR) frequently referred to simply as Rochester, is a private, nonsectarian, research university in Rochester, New York.', ' The university grants undergraduate and graduate degrees, including doctoral and professional degrees.', ' The university has six schools and various interdisciplinary programs.']], ['Andrew J. Elliot', ['Andrew J. Elliot (born 1962) is a professor of psychology at the University of Rochester.', ' His research on the hierarchical model of approach and avoidance motivation focuses on combining classic and contemporary methods to test various theories.', \" Elliot's work in social psychology is cited frequently by those in the field, causing him to be named one of Thomson Reuters' ISI Highly Cited for the Social Sciences in 2010.\"]], ['List of yoga schools', ['Yoga, rather than being the name for a singular lineage or even a specific practice, is a bracket term that covers a number of methodologies, each with a number of schools.', ' Within the major branches of yoga such as haṭha, lāya, rāja, jñāna, and bhakti there are many different schools and lineages, both extant and defunct.', ' Since the late 19th century, a great number of distinct new styles of \"Yoga\" have been introduced by individual teachers.', ' There are also a number of schools and traditions that are occasionally referred to as yoga or yogic for their similar practices despite having no foundation in the Indian tradition such as Shin Shin Tōitsu-dō, and Daoyin.']]], 'type': 'bridge', 'level': 'hard'}\n", 
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:21:23.530\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 7%|▋ | 34/500 [00:22<11:29, 1.48s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:23.552\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:23.669\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 7%|▋ | 36/500 [00:23<08:47, 1.14s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:23.825\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:23.840\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 38/500 [00:23<06:40, 1.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 40/500 [00:28<10:08, 1.32s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:21:34.868\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a80b3a9554299485f5986cc', 'answer': 'Fairfax County', 'question': 'Tysons Galleria is located in what county?', 'supporting_facts': [['McLean, Virginia', 0], ['Tysons Galleria', 0]], 'context': [[\"Scott's Run Nature Preserve\", [\"Scott's Run Nature Preserve is a nature preserve in Fairfax County, Virginia, United States.\", ' Located in McLean, 
it is bordered by Virginia State Route 193 to its south, Interstate 495 to its east and the Potomac River to its north.', \" It encompasses 336 acre of woodland with its namesake, Scott's Run, flowing through its west side.\", \" Scott's Run originates in nearby Tysons Corner and enters the Potomac on the northwest side of the preserve.\", ' The preserve is noted for including eastern hemlocks among its plant life, which are rare for the area.', ' It is a popular destination for recreation and hiking and is operated by the Fairfax County Park Authority.']], ['Tysons Galleria', ['Tysons Galleria is a three-level super-regional mall owned by General Growth Properties located at 2001 International Drive, McLean, Virginia, in Tysons Corner.', ' It is the second-largest mall in McLean/Tysons Corner, and one of the largest in the Washington metropolitan area.']], ['Poughkeepsie Galleria', ['The Poughkeepsie Galleria (locally known as \"The Galleria\") is an upscale shopping center on U.S. 9 in the Town of Poughkeepsie, New York, located just north of Wappingers Falls, and is the largest shopping center in Dutchess County.', ' The Galleria has an area of 1100000 sqft with two floors containing 140 shops and 14 restaurants as well as a 16-screen, stadium-seating Regal Cinemas theater.']], ['Tysons, Virginia', ['Tysons, or formerly “Tysons Corner” is a census-designated place (CDP) and unincorporated community in Fairfax County, Virginia, United States.', ' Located in Northern Virginia between the community of McLean and the town of Vienna along the Capital Beltway (I-495), it lies within the Washington Metropolitan Area.', ' Tysons is home to two super-regional shopping malls—Tysons Corner Center and Tysons Galleria—and the corporate headquarters of numerous companies such as Intelsat, Gannett, Hilton Worldwide, Freddie Mac, Capital One and Booz Allen Hamilton.', \" Tysons is Fairfax County's central business district and a regional commercial center.\", ' It has been 
characterized as a quintessential example of an edge city.', ' The population was 19,627 as of the 2010 census.']], ['Tysons Corner Center', ['Tysons Corner Center, located in the Tysons Corner unincorporated area in Fairfax County, Virginia, United States (between McLean and Vienna, Virginia), opened to the public in 1968, becoming one of the first fully enclosed, climate-controlled shopping malls in the Washington metropolitan area.']], ['Cobb Galleria Centre', ['The Cobb Galleria Centre is a meeting and convention center and a shopping center in the Cumberland/Galleria district of Cobb County, northwest of Atlanta, Georgia, in the United States.', ' It is also located next to a cluster of mid-rise office buildings, the Cumberland Mall and the Cobb Energy Performing Arts Centre.', ' It has hosted over 15,000 events and millions of guests.', ' The venue operates under the direction of the Cobb-Marietta Coliseum and Exhibit Hall Authority and is located at the intersection of three major highways: Interstate 75, Interstate 285, and Cobb Parkway (U.S. 
41) just northwest of the city.', ' The Galleria Specialty Mall, which pre-dates the convention center, is located downstairs, with meeting halls upstairs.']], ['West McLean, Virginia', ['West McLean is an unincorporated community in Fairfax County, Virginia, United States.', ' West McLean is located in the western part of the McLean census-designated place and includes much of the Tysons Corner area.', ' West McLean has its own post office which has ZIP code 22103, which is used primarily for the PO Boxes at that Post Office.', ' Other than the Post Office itself, West McLean uses ZIP code 22102.']], ['McLean station', ['McLean (preliminary name Tysons East, Tysons–McLean) is a Washington Metro station in Fairfax County, Virginia, on the Silver Line.', ' The station is located in Tysons Corner, with a McLean postal address.', ' It began operation on July 26, 2014.']], ['McLean, Virginia', ['McLean ( ) is a census-designated place (CDP) in Fairfax County in Northern Virginia.', ' McLean is home to many diplomats, businessmen, members of Congress, and high-ranking government officials partially due to its proximity to Washington, D.C. and the Central Intelligence Agency.', ' It is the location of Hickory Hill, the former home of Ethel Kennedy, the widow of Robert F. 
Kennedy.', ' It is also the location of Salona, the former home of Light-Horse Harry Lee, the Revolutionary War hero.', ' The community had an estimated total population of 53,673 in 2015, according to estimates prepared by the United States Census Bureau.', ' It is located between the Potomac River and the town of Vienna.', ' McLean is known for its luxury homes and its high-end shopping destinations: the Tysons Corner Center and the Tysons Galleria.', ' The two McLean zip codes - 22101 and 22102 - are among the most expensive ZIP Codes in Virginia and the United States.']], ['Spring Hill station', ['Spring Hill (preliminary names Tysons West, Tysons–Spring Hill Road) is a Washington Metro station in Fairfax County, Virginia, on the Silver Line.', ' Located in Tysons Corner, it began operation on July 26, 2014.', ' The station is located in the central median of Leesburg Pike (SR 7) just west of Spring Hill Road.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 41/500 [00:34<15:45, 2.06s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:21:35.506\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 42/500 [00:34<13:41, 1.79s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:35.513\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.408\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▉ | 44/500 [00:39<15:22, 2.02s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.544\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▉ | 45/500 [00:40<12:29, 1.65s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.716\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▉ | 46/500 [00:40<09:59, 1.32s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.741\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.811\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:21:40.812\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.938\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|█ | 50/500 [00:40<04:36, 1.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:40.997\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:41.025\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:41.053\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 11%|█ | 53/500 [00:40<02:57, 2.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:21:46.648\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:21:46.652\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 11%|█ | 55/500 [00:46<07:38, 1.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:46.703\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:46.818\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 58/500 [00:46<04:59, 1.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:46.902\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:46.962\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 60/500 [00:46<03:50, 1.91it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:46.998\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:47.053\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:47.084\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 13%|█▎ | 63/500 [00:46<02:34, 2.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:52.841\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 13%|█▎ | 65/500 [00:52<07:10, 1.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:52.864\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:52.929\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:52.940\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:53.027\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 14%|█▍ | 69/500 [00:52<04:16, 1.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:53.061\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:53.133\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 14%|█▍ | 71/500 [00:52<03:21, 2.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:53.146\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:21:53.189\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 15%|█▌ | 76/500 [00:54<03:04, 2.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 16%|█▌ | 78/500 [00:55<02:56, 2.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 17%|█▋ | 83/500 [00:55<01:39, 4.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 17%|█▋ | 85/500 [00:56<01:20, 5.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 17%|█▋ | 87/500 [00:56<01:10, 5.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 18%|█▊ | 89/500 [00:56<01:20, 5.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 
'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 19%|█▊ | 93/500 [00:57<00:53, 7.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 96/500 [00:57<00:45, 8.91it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|█▉ | 99/500 [00:57<00:39, 10.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.08, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 101/500 [00:57<00:36, 10.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██ | 104/500 [00:57<00:34, 11.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 22%|██▏ | 108/500 [00:58<00:33, 11.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 
1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 110/500 [00:58<00:31, 12.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 23%|██▎ | 114/500 [00:59<00:41, 9.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.1111111111111111, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 116/500 [00:59<00:45, 8.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 24%|██▍ | 119/500 [01:00<01:05, 5.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 24%|██▍ | 121/500 [01:00<01:23, 4.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▍ | 123/500 [01:00<01:02, 6.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"metrics {'f1': 0.4615384615384615, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 25%|██▌ | 125/500 [01:01<01:07, 5.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 27%|██▋ | 133/500 [01:01<00:22, 16.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 27%|██▋ | 136/500 [01:01<00:29, 12.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.7777777777777778, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 28%|██▊ | 140/500 [01:02<00:32, 10.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 28%|██▊ | 142/500 [01:07<04:03, 1.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", 
"text": [ "\r", "Evaluating workflow: 29%|██▊ | 143/500 [01:07<03:37, 1.64it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 29%|██▉ | 146/500 [01:07<02:12, 2.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:08.426\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 15 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▉ | 147/500 [01:07<01:59, 2.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:08.499\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a881cbb55429938390d3ee7', 'answer': 'Hellenism', 'question': 'St. John of the Cross Episcopal Church has a rectory that was in a style that was a product of what earlier style?', 'supporting_facts': [['St. 
John of the Cross Episcopal Church', 3], ['Greek Revival architecture', 1]], 'context': [[\"St. John's Episcopal Church, Canandaigua\", ['The Episcopal presence in Canandaigua,New York begins about 1799 with the St. Matthew Society, a missionary group.', \" St. John's Episcopal Church was organized in 1814 and first met in the Ontario County, New York Court House in Canandaigua.\", \" St. John's erected a wooden church building in 1816.\", ' Bishop John Henry Hobart consecrated it that year.', ' The brick rectory was constructed alongside the church in 1851.', ' Steady congregational growth necessitated more space.', ' This led to the razing of the first church and the constructing of the current, larger stone Gothic building done in the parish church style popular in the 19th century.', ' Emlen T. Littel of New York, who was also the architect of Zion Episcopal Church (Palmyra, New York) designed the building.', ' This church was constructed in 1872 at a cost of $47,000 (approximately $850,000 today) and consecrated in 1886.', ' It contains several windows from the earlier wooden church, elaborate new stained glass windows imported from Europe, and one—The Parables Window—was designed by Daniel Cottier(1837–1891), who was considered an important influence on Louis Comfort Tiffany.', ' In 1908, new hardwood floors, choir stalls, and an organ were installed.', ' The parish house and a chapel were added at the same time.', ' In 1964-65, an addition to parish house included classrooms, a new chapel, and a dining/ meeting room added to celebrate the church’s 150th anniversary.', ' Recent additions include a columbarium with a capacity of 136 niches and a memorial garden, outside the church, reached through the columbarium and chapel in the south transept.']], ['Episcopal Diocese of the Virgin Islands', ['The Episcopal Diocese of the Virgin Islands is a diocese of the Episcopal Church in the United States of America (ECUSA) which includes both the United States Virgin 
Islands and the British Virgin Islands.', ' The diocese is a part of Province II of the Episcopal Church.', ' The current Diocesan Bishop of the Virgin Islands is the Edward Ambrose Gumbs.', ' The cathedral church of the diocese is the Cathedral Church of All Saints, Charlotte Amalie.', ' The diocese currently comprises 14 churches.', \" There is a functioning parish school on St. Thomas All Saints Cathedral School there was an academic campus on St. Croix, St. Dunstan's Episcopal High School.\", \" St. Dunstan's closed in the 1990s.\", \" There is also the St. Georges School located on the parish property of St. Georges Episcopal Church in Road Town, Tortola in the British Virgin Islands, which also opened the St. Georges School (Secondary Division) in Palestina Estate near to the St. Paul's Episcopal Church in Sea Cow's Bay, Tortola in the British Virgin Islands.\", \" There is also the St. Mary's School located on the parish property of the St. Mary's Episcopal Church in Valley, Virgin Gorda in the British Virgin Islands.\"]], [\"St. Luke's Church and Cemetery\", [\"St. Luke's Episcopal Church and Cemetery is a historic Episcopal church complex, cemetery, and national historic district located at 303-321 N. Cedar Street, 322 E. 
McBee Street in Lincolnton, Lincoln County, North Carolina.\", ' The complex includes the church, parish hall, and rectory.', ' The church was built in 1885-1886, and is a Late Gothic Revival style frame structure with a brick veneer added in 1922-1923.', ' The tower is believed to date to 1859.', ' The parish hall was built in 1907, and is a one-story, rectangular frame building.', ' The rectory was built in 1911-1912, and is a two-story, \"T\"-form Colonial Revival style dwelling with a pebbledash finish.', ' The cemetery includes approximately 300 gravestones, with the earliest dating to 1854.']], ['Mountain Road Historic District', ['Mountain Road Historic District is a national historic district in Halifax, Halifax County, Virginia.', ' The district includes 22 contributing buildings located along Mountain Road (State Route 360) and consists of two churches, a parish hall, a masonic hall, and a host of private residences dating to the 19th and early 20th centuries.', \" Notable buildings include the Masonic Lodge (1828), Methodist Church (1831), St. John's Episcopal Church (1844), Magnolia Hill, Grand Oaks, and St. John's Rectory.\", \" Several of the earlier dwellings and St. John's Episcopal Church were designed by Dabney Cosby, Jr., son of the Jeffersonian workman, Dabney Cosby, Sr.\"]], ['St. John of the Cross Episcopal Church', ['St. John of the Cross Episcopal Church, Rectory and Cemetery is a historic Episcopal church complex located at Bristol, Elkhart County, Indiana.', ' The church was built between 1843 and 1847, and is a one-story, Gothic Revival style frame building.', ' It has a projecting bell tower with octagonal roof and lancet windows.', ' The associated rectory was built in 1830, and is a 1 1/2-story, rectangular, Greek Revival style frame dwelling.', ' The complex also includes the contributing church cemetery.']], [\"St. Augustine's Episcopal Church Complex\", [\"St. 
Augustine's Episcopal Church Complex is a historic Episcopal church complex at 6 Old Post Road north of Croton-on-Hudson, Westchester County, New York.\", ' The complex consists of the church and rectory The church consists of the original building and a later parish hall connected by an enclosed hyphen.', ' The church was built in 1857, the parish hall was added in 1882, and the rectory was completed in 1910.', ' The church and parish hall are in the Gothic Revival style, while the rectory is in the Colonial Revival style.']], ['St. Barnabas Episcopal Church (Troy, New York)', ['St. Barnabas Episcopal Church, later called Christ & St. Barnabas Episcopal Church, and now known as New Hope Missionary Baptist Church, is an historic Episcopal church and rectory at 2900 Fifth Avenue in Troy, Rensselaer County, New York.', ' The church was built in 1895 and is a red brick church in the Late Gothic Revival style.', ' It has a gable roof and three hipped dormers.', ' It has an open bell tower and slender conical turrets.', ' It features a rose window depicting the Madonna and Child.', ' The former rectory is a 2\\xa0⁄ -story, L-shaped brick residence.', ' Also on the property is a contributing carved stone crucifixion dated to about 1900.']], ['Greek Revival architecture', ['The Greek Revival was an architectural movement of the late 18th and early 19th centuries, predominantly in Northern Europe and the United States.', ' A product of Hellenism, it may be looked upon as the last phase in the development of Neoclassical architecture.', ' The term was first used by Charles Robert Cockerell in a lecture he gave as Professor of Architecture to the Royal Academy of Arts, London in 1842.']], [\"St. Peter's Episcopal Church and Rectory\", [\"St. Peter's Episcopal Church and Rectory is a historic Episcopal church and rectory at 36-38 W. 
Campbell Street in Blairsville, Indiana County, Pennsylvania.\", ' The church was built in 1830, and is a small, rectangular brick building on a stone foundation in an Early Gothic Revival style.', ' It features a belfry atop the front entrance gable roof.', ' The rectory was built in 1889, and is a 2\\xa01/2-story, wood frame building with Eastlake Movement elements.']], ['Old Rectory (Perrowville, Virginia)', [\"Old Rectory of St. Stephen's Episcopal Church is a historic Episcopal church rectory located near Perrowville, Bedford County, Virginia.\", ' It was built in 1787, and is a \"T\"-shaped frame dwelling with exterior end chimneys and a gable roof.', ' It features a modern one bay, two-story portico supported by four fluted Doric order columns.', \" From around 1828 to 1904, the house served as the rectory of St. Stephen's Episcopal Church.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 15 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:22:08.563\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5addda9b5542992200553b5b', 'answer': 'Isla de Xativa', 'question': 'What was the island, on which Marinelli Glacier is located, formerly known as?', 'supporting_facts': [['Marinelli Glacier', 0], ['Isla Grande de Tierra del Fuego', 0]], 'context': [['Sherburne Ranger Station Historic District', ['The Sherburne Ranger Station in Glacier National Park is an example of the National Park Service Rustic style.', ' Located in the Swiftcurrent portion of the park, it was built in 1926.', ' It is part of a small historic district that includes a mess hall and subsidiary structures, formerly known as the Sherburne Road Camp, established in 1931.', ' The ranger station closely resembles the ranger stations at Belly River and Lake McDonald.', ' A checking station at the road remains substantially intact.']], ['Rising Sun Auto Camp', ['The Rising Sun Auto Camp, also known as the Roes Creek Auto Camp, East Glacier Auto Camp or simply Rising Sun preserves a portion of the built-up area of Glacier National Park that documents the second phase of tourist development in the park.', ' Rising Sun is located along the Going-to-the-Sun Road, approximately 7 mi from the east entrance to Glacier National Park, Montana, United States.', \" Rising Sun is a wayside area that has a National Park Service campground, a camp store and gift shop, picnic area, restaurant, as well as a motel and guest cabins which are managed by the park's concessionaire, Xanterra Parks & Resorts.\", ' In the immediate area, there is also a boat dock as well as sightseeing 
boats which allow visitors to tour Saint Mary Lake, the second largest lake in the park.', ' \"The most popular spot for [Glacier] tourists is Rising Sun, an overlook of Goose Island in St. Mary Lake and one of the most photographed spots in the park.\"']], ['Wanshan Archipelago Campaign', ['The Wanshan Archipelago Campaign (万山群岛战役) was a campaign fought between the communist and the nationalist forces during the Chinese Civil War for the control of Wanshan Archipelago (\"Wanshan Qundao\", 万山群岛), and resulted in communist victory.', ' The archipelago consists of 48 islands strategically located at the mouth of the Pearl River, a chokepoint on the communication lines to Hong Kong and Macau.', ' The largest island is the Laurel Mountain (Guishan, 桂山) Island, which was formerly known as Trash Tail (Lajiwei, 垃圾尾) Island.', \" Other major islands include Outer Linding (Wailinding, 外伶仃) Island, Dong'ao (东澳) Island, Tri-gate (Sanmen, 三门) Island, Greater Ten-thousand Mountain (Da Wanshan, 大万山) Island, Lesser Ten-thousand Mountain (Xiao Wanshan, 小万山) Island, Burden Pole (Dangan, 担杆) Islands, and Jianpeng (佳蓬) Islands.\"]], ['Birnie Island', ['Birnie Island is a small, uninhabited coral island, 20 hectares in area, part of the Phoenix Island group, that is part of the Republic of Kiribati.', ' It is located about 100\\xa0km SE of Kanton Island and 90\\xa0km WNW of Rawaki Island, formerly known as Phoenix Island.', ' It lies at .', ' Birnie island measures only 1.2\\xa0km long and 0.5\\xa0km wide.', ' There is no anchorage, but landing can be made on the lee beach.']], ['Marinelli Glacier', ['Marinelli Glacier is a tidewater glacier located in Alberto de Agostini National Park, Isla Grande de Tierra del Fuego.', ' The glacier spills out from the backbone of the Cordillera Darwin and calves into Ainsworth Bay, an embayment of the Almirantazgo Fjord.', ' The Marinelli Glacier is in a state of retreat, beginning at least as early as 1960 and continuing to the present time.']], 
['Boaz Island, Bermuda', ['Boaz Island, formerly known as \"Gate\\'s Island\" or \"Yates Island\", is one of the six main islands of Bermuda.', ' It is part of a chain of islands in the west of the country that make up Sandys Parish, lying between the larger Ireland Island and Somerset Island, and is connected to both by bridges.', ' Its east coast forms part of the edge of the Great Sound.', ' Boaz Island was part of the Royal Naval base, which included the HM Dockyard on Ireland Island.', ' From 1939, Boaz Island was used as a Royal Naval Air Station.', ' Its primary role was the servicing, repair and replacement of spotter floatplanes and flying boats belonging to naval vessels.', \" Early in the Second World War, with no other units to fill the role, aeroplanes from Boaz Island were used to maintain anti-submarine air patrols, using whatever aircrew were on hand, including pilots from the Bermuda Flying School on Darrell's Island.\", ' All that remains of the Fleet Air Arm facility today is a hangar on runway road, and two slips.']], ['Sea Pines Resort', ['The Sea Pines Resort or Sea Pines is located in Sea Pines Plantation, a 5,200-acre private residential gated community located on the southern tip of the island which comprises the town of Hilton Head Island, South Carolina.', ' Sea Pines is home to four golf courses, including Harbour Town Golf Links, Atlantic Dunes by Davis Love III, (formerly known as the Ocean Course), the Heron Point golf course (formerly known as the Sea Marsh course) and the Sea Pines Country Club Course.', ' The RBC Heritage is a PGA Tour event held annually in April at the Harbour Town course.']], ['Isla Grande de Tierra del Fuego', ['Tierra del Fuego—literally \"Land of the Fire\", formerly \"Isla de Xativa\" and also known as Isla Grande de Tierra del Fuego—is an island near the southern tip of South America from which it is separated by the Strait of Magellan.', ' The western portion (61.43%) of the island (29,484.7 km2 ) is in 
Chile (Province of Tierra del Fuego and Antártica Chilena Province), while the eastern portion (38.57%, 18,507.3 km2 ) is in Argentina (Tierra del Fuego Province).', ' It forms the major landmass in an extended group of islands or archipelago also known as Tierra del Fuego.']], ['Douglas River', [\"The Douglas River, formerly known as the Twain, is a river of the West Coast of New Zealand's South Island.\", ' Its source is high in the Southern Alps, five kilometres south of Mount Sefton, and its upper reaches are fed by water from the Douglas Glacier.', ' It flows west for 18 kilometres, joined by runoff from the Horace Walker Glacier, before joining the waters of the Karangarua River.', \" The Douglas River's entire course is within Westland Tai Poutini National Park.\", ' The river and glacier are named after Charles Edward Douglas, a 19th-century explorer and mountaineer.']], ['Lian Island', ['Lian Island () is the largest island in Lianyungang, Jiangsu, China.', ' The island is located inside Haizhou Bay in the Yellow Sea.', ' It is 9 km long from east to west across the island and it has an area of 7.57 km2 .', ' 80% of the island is covered with forests.', ' The longest sea dyke nationally (6.7 km long) connects the island with the east of the city of Lianyungang.', ' Lian Island is the only AAAA-class seashore tourist attraction in Jiangsu.', ' The island was formerly known as Yingyou hill.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 15 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|██▉ | 149/500 [01:08<01:24, 4.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:22:12.717\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|███ | 150/500 [01:19<01:24, 4.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:22:23.010\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a728b1c5542992359bc30e0', 'answer': 'Jonghyun', 'question': 'Which member of the boy group Shinee released their first studio album \"She is\"?', 'supporting_facts': [['She Is', 0], ['Kim Jong-hyun (singer)', 1]], 'context': [['Romeo (EP)', ['Romeo (Korean: 로미오 ) is the second EP of South Korean boy group Shinee.', ' It was released on May 25, 2009 in South Korea under the seal of the label S.M. 
Entertainment.', ' The EP consists of six tracks including the title song \"Juliette\" and is Shinee\\'s first Korean release after nine months hiatus.', ' On August 29, 2011 a Japanese version of \"Juliette\" was released as Shinee\\'s second Japanese single with the original Japanese song \"Kiss Kiss Kiss\" as a B-side.', ' The release peaked at #3 on the weekly Oricon chart.']], ['List of awards and nominations received by Shinee', ['South Korean boy group Shinee have received several awards and nominations for their music work.', ' The group was formed by S.M. Entertainment in 2008 and released their first full-length album, \"The Shinee World\", on August 28, 2008, which won the Newcomer Album of the Year at the 23rd Golden Disk Awards.', ' The first single released from the album was \"Sanso Gateun Neo (Love Like Oxygen)\" and won first place on \"M Countdown\" on September 18, 2008 making it the group\\'s first win on Korean music shows since debut.', ' Their second album \"Lucifer\" (2010) produced two singles, \"Lucifer\" and \"Hello\".', ' For their outstanding choreography the group was nominated for the Best Dance Performance Award at the Mnet Asian Music Awards in 2010. \"', 'Lucifer\" also won the Disk Bonsang Award at the 25th Golden Disk Awards as well as the Popularity Award.', ' On March 21, 2012 the group released their fourth EP \"Sherlock\" for which the group was awarded another Disk Bonsang Award at the 27th Golden Disc Awards and the Bonsang Award at the 22nd Seoul Music Award.', ' Also following the success of the lead single it was also nominated for Song of the Year at the 2012 Mnet Asian Music Awards.']], ['1 of 1 (album)', ['1 of 1 is the fifth Korean studio album and the eighth overall by South Korean boy band Shinee.', ' It was released digitally and physically on October 5, 2016, under S.M. 
Entertainment and distributed by KT Music.', ' The album contains nine songs, including the title track of the same name, \"1 of 1\".', ' Musically, the album is a modernized twist on the retro genre, and stretches back to the 1980–1990 period.', \" Additionally, based on their '90s theme, the group released a limited edition of cassette tapes besides the usual CD version.\", ' In order to promote the album, Shinee appeared on several South Korean music programs, such as \"Music Bank\", \"Show!', ' Music Core\", and \"Inkigayo\", where they performed material from the album.', ' On November 15, 2016, Shinee released a repackaged version of their fifth studio album titled \"1 and 1\" with 5 new songs, including the title track \"Tell Me What to Do\".']], ['Kim Jong-hyun (singer)', ['Kim Jong-hyun (born April 8, 1990), better known by the mononym Jonghyun, is a South Korean singer-songwriter, and radio host.', \" He is a vocalist of the South Korean boy group Shinee, and has further participated in S.M. Entertainment's project group S.M.\", ' The Ballad.', ' Jonghyun debuted as a solo artist on January 12, 2015, with his first EP, titled \"Base\".', ' In the same year, on September 17, Jonghyun released a compilation album, \"Story Op.1\".', ' On May 24, 2016, Jonghyun released his first studio album, \"She Is\", followed by his second compilation album, \"Story Op.2\" on April 24, 2017.']], ['She Is', ['She Is (Hangul: 좋아 ; RR: \"Joh ah \" \"good\") is the first studio album by South Korean singer-songwriter Jonghyun, released on May 24, 2016 by S.M. 
Entertainment and distributed by KT Music.']], ['Shinee World 2012', ['Shinee World 2012 (promoted as THE FIRST JAPAN ARENA TOUR \"SHINee WORLD 2012\") is the first Japan nationwide concert tour by South Korean boy group Shinee to support their first Japanese studio album, \"The First\".', ' The tour kicked off in Fukuoka on April 25, 2012 and ended in Hiroshima on July 1, 2012 with a total of 20 concerts in 7 cities.']], ['List of songs written by Kim Jong-hyun', ['Kim Jong-hyun (most often credited as Jonghyun), is a South Korean singer-songwriter and producer.', ' He began his musical career in 2008 as a member of the group Shinee and later formed the ballad group S.M.', ' The Ballad.', ' Jonghyun debuted as a composer happened to write Korean lyrics for the Shinees promotional single \"Juliette\", which was featured in the mini-album \"Romeo\", released in May 2009.', ' Participate in the writing of three songs on Shinee\\'s second Korean studio album, \"Lucifer\", the first, \"Up & Down\", was co-written with Misfit with the rap being written by Minho, the second, \"Obsession\", was completely written by Jonghyun with Minho once again working on his own rap, and the third \"Shout Out\" co-written by all members of Shinee, JQ and Misfit.', ' In 2012, Jonghyun co-wrote the lyrics to the song with \"Alarm Clock\" with Minho, a song about wishing to wake up from the nightmare of a past break up, and wrote the lyrics to \"Honesty\" which was described as a song written for the fans who had stayed by their side with unchanging love until that point.', ' Both songs were featured on the mini album, \"Sherlock\".']], ['Sherlock (EP)', ['\"Sherlock\" is the fourth EP of South Korean boy group Shinee.', ' The EP consists of seven tracks including the title song \"Sherlock (Clue + Note)\" a hybrid remix of the two songs.', ' It was released on March 21, 2012, in South Korea under the seal of the label S.M. 
Entertainment and distributed by KT Music.', ' The album was made available online worldwide on March 19, 2012.', \" The EP is Shinee's first Korean release after a year and 6 months hiatus.\"]], ['Shinee World 2013', ['Shinee World 2013 (promoted as JAPAN ARENA TOUR SHINee WORLD 2013 ~Boys Meet U~) is the second Japan nationwide concert tour by South Korean boy group Shinee to promote their second Japanese studio album, \"Boys Meet U\".', ' The tour kicked off in Saitama on June 28, 2013 and ended in Nagoya on December 11, 2013 with a total of 15 concerts in 9 cities.']], ['The First (album)', ['\"The First\" is the first Japanese studio album by South Korean boy group Shinee.', ' The album was scheduled for release on November 23, 2011, however it was delayed to December 7, 2011 in Japan under EMI Music Japan.', ' The album features three previously released singles, \"Replay\", \"Juliette\" and \"Lucifer\", all of which have ranked within the top three on Oricon charts.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|███ | 151/500 [01:22<15:12, 2.61s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:22:23.079\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 18 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:23.091\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 24 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 31%|███ | 154/500 [01:22<08:55, 1.55s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:22:23.282\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8eea4a5542990e94052bb7', 'answer': 'between the 8th and 16th centuries', 'question': 'When was the Western Germanic language spoken from which the small settlement situated on the river Leda opposite Leer derives its name?', 'supporting_facts': [['Leda (river)', 3], ['Old Frisian', 0]], 'context': [['German language', ['German (\"Deutsch\" ] ) is a West Germanic language that is mainly spoken in Central Europe.', ' It is the most widely spoken and (co-) official language in Germany, Austria, Switzerland, South Tyrol (Italy), the German-speaking Community of Belgium, and Liechtenstein.', ' It is also one of the three official languages of Luxembourg.', ' The languages which are most similar to German are the other members of the West Germanic language branch: Afrikaans, Dutch, English, the Frisian languages, Low German/Low Saxon, Luxembourgish, and Yiddish.', ' German is the second most widely spoken Germanic language, after English.']], ['English language', ['English is a West Germanic language that was first spoken in early medieval England and is now a global \"lingua franca\".', ' Named after the Angles, one of the Germanic tribes that migrated to England, it ultimately derives its name from the Anglia (Angeln) peninsula in the Baltic Sea.', ' It is closely related to the Frisian languages, but its 
vocabulary has been significantly influenced by other Germanic languages, particularly Norse (a North Germanic language), as well as by Latin and Romance languages, particularly French.']], ['Proto-Norse language', ['Proto-Norse (also called Proto-Scandinavian, Primitive Norse, Proto-Nordic, Ancient Nordic, Ancient Scandinavian, Old Nordic, Old Scandinavian, Proto-North Germanic, North Proto-Germanic or Common Scandinavian) was an Indo-European language spoken in Scandinavia that is thought to have evolved as a northern dialect of Proto-Germanic in the first centuries CE.', ' It is the earliest stage of a characteristically North Germanic language, and the language attested in the oldest Scandinavian Elder Futhark inscriptions, spoken around from the 2nd to 8th centuries (corresponding to the late Roman Iron Age and the Germanic Iron Age).', ' It evolved into the dialects of Old Norse at the beginning of the Viking Age in about 800, which later themselves evolved into modern North Germanic languages.']], ['Dutch language', ['Dutch (\\xa0\\xa0 ) is a West Germanic language that is spoken by around 24 million people as a first language—including the population of the Netherlands and about sixty percent that of Belgium—and by another 5 million as a second language.', ' It is the third most widely spoken Germanic language, after English and German.']], ['Old Frisian', ['Old Frisian is a West Germanic language spoken between the 8th and 16th centuries in the area between the Rhine and Weser on the European North Sea coast.', \" The Frisian settlers on the coast of South Jutland (today's Northern Friesland) also spoke Old Frisian but no medieval texts of this area are known.\", ' The language of the earlier inhabitants of the region between the Zuiderzee and Ems River (the Frisians mentioned by Tacitus) is attested in only a few personal names and place-names.', ' Old Frisian evolved into Middle Frisian, spoken from the 16th to the 19th century.']], ['Afrikaans', 
['Afrikaans ( ) is a West Germanic language spoken in South Africa, Namibia and, to a lesser extent, Botswana and Zimbabwe.', ' It evolved from the Dutch vernacular of South Holland (Hollandic dialect) spoken by the mainly Dutch settlers of what is now South Africa, where it gradually began to develop distinguishing characteristics in the course of the 18th century.', ' Hence, it is a daughter language of Dutch, and was previously referred to as \"Cape Dutch\" (a term also used to refer collectively to the early Cape settlers) or \"kitchen Dutch\" (a derogatory term used to refer to Afrikaans in its earlier days).', ' However, it is also variously described as a creole or as a partially creolised language.', ' The term is ultimately derived from Dutch \"Afrikaans-Hollands \" meaning \"African Dutch\".', ' It is the first language of most of the Afrikaners and Coloureds of Southern Africa.']], ['Leer', ['Leer is a town in the district of Leer, the northwestern part of Lower Saxony, Germany.', ' It is situated on the river Leda, a tributary of the river Ems, near the border with the Netherlands.']], ['Suorva', ['Suorva or Suorvadammen (the Suorva Dam) is a small settlement situated at the southern parts of Akkajaure, in Stora Sjöfallet National Park, Sweden.', ' The settlement can be reached by car (and bus, from Gällivare).', ' It consists of a few houses and a dam operated by Vattenfall, which regulates the flow to the hydroelectric plant in Vietas located about 5 kilometers downstream.', ' The road over the dam is normally open for hikers (not cars) and makes for a possible route into the northern parts of Sarek National Park which does not require using a boat.']], ['Old Saxon', ['Old Saxon, also known as Old Low German, was a Germanic language and the earliest recorded form of Low German (spoken nowadays in Northern Germany, the northeastern Netherlands, southern Denmark, the Americas and parts of Eastern Europe).', ' It is a West Germanic language, closely 
related to the Anglo-Frisian languages.', ' It has been documented from the 8th century until the 12th century, when it gradually evolved into Middle Low German.', ' It was spoken throughout modern northwestern Germany, primarily in the coastal regions and in the eastern Netherlands by Saxons, a Germanic tribe who inhabited the region of Saxony.', \" It partially shares Anglo-Frisian's (Old Frisian, Old English) Ingvaeonic nasal spirant law which sets it apart from Low Franconian and Irminonic languages, such as Dutch, Luxembourgish and German.\"]], ['Leda (river)', ['The Leda is a river in north-western Germany in the state of Lower Saxony.', ' It is a right tributary of the Ems and originates at the confluence of the Sagter Ems and the Soeste (Dreyschloot) near the town of Barßel.', ' The Leda flows into the Ems near the town of Leer.', ' On the southern bank of the Leda, in the \"Overledingen Land\" (Overledingen=\"country over the Leda\"), opposite Leer, lies the small settlement of Kloster Muhde (\"Muhde\" from the Old Frisian \"mutha\" meaning \"(river) mouth\").', ' The total length of the river is 29 km , of which the lower 1.9 km are navigable for sea-going vessels.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 18 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:22:23.619\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 18 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 31%|███▏ | 157/500 [01:23<05:46, 1.01s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:23.634\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 18 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 159/500 [01:24<05:23, 1.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:22:40.501\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a88a19a5542997e5c09a64b', 'answer': 'moth', 'question': 'The American Sweetgum is the hostplant of what kind of bug?', 'supporting_facts': [['Phyllocnistis liquidambarisella', 0], ['Phyllocnistis liquidambarisella', 1], ['Liquidambar styraciflua', 0]], 'context': [['The Love Bug (1997 film)', ['The Love Bug is a 1997 American made-for-television comedy adventure film and a sequel/remake of the 1968 film of the same name produced by Walt Disney Television which premiered on ABC as part of \"The Wonderful World of Disney\" on November 30, 1997.', ' The remake starred Bruce Campbell and included a special appearance by Dean Jones, star of the original \"The Love Bug\", tying it to the previous films and introduced an evil black Volkswagen named Horace, \"The Hate Bug\", giving the film a much darker tone than the other \"Herbie\" films.']], ['Melacoryphus lateralis', ['Melacoryphus lateralis is a species of true bug, one of several called black-and-red seed bug.', ' Black and fringed with red and gray, some call it the charcoal seed bug, due to its resemblance to a dying ember.', ' Native to the deserts of western North American, they have a tendency to appear in large numbers in the late summer.']], ['Manahawkin Wildlife Management Area', ['Manahawkin Wildlife Management Area (Manahawkin Bottomland 
Hardwood Forest) is a 1642 acre wildlife management area near Manahawkin, Stafford Township, Ocean County, New Jersey.', ' It was designated a National Natural Landmark in January 1976.', ' It is known for its mature bottomland hardwood forest which contains examples of American sweetgum, red maple and black gum trees.']], ['Liquidambar styraciflua', ['American sweetgum (\"Liquidambar styraciflua\"), also known as American storax, hazel pine, bilsted, redgum, satin-walnut, star-leaved gum, alligatorwood, or simply sweetgum, is a deciduous tree in the genus \"Liquidambar\" native to warm temperate areas of eastern North America and tropical montane regions of Mexico and Central America.', ' Sweet gum is one of the main valuable forest trees in the southeastern United States, and is a popular ornamental tree in temperate climates.', ' It is recognizable by the combination of its five-pointed star-shaped leaves and its hard, spiked fruits.', ' It is currently classified in the plant family Altingiaceae, but was formerly considered a member of the Hamamelidaceae.']], ['Ischnodemus sabuleti', ['Ischnodemus sabuleti, also known as the European chinch bug, is a species of swarming true bug from the family Blissidae, which family also includes the American Chinch Bug \"Blissus leucopterus\".', ' It was first described by Carl Fredrik Fallén in 1826.']], ['Stenodema laevigatum', ['Stenodema laevigatum, or sometimes Stenodema laevigata (also called Grass bug), is a carnivorous species of bug from Miridae family.', ' The species have a gray to brown elongated body, with the eyes located backwards in the head.', ' Sometimes they might come in green colour.', ' They are 8 - in length, which makes it a rather big species of its kind.', ' They are common in the United Kingdom, and throughout the rest of Europe.']], ['Datronia scutellata', ['Datronia scutellata is a plant pathogen that causes wood rot on \"Liquidambar\" (sweetgum) and \"Platanus occidentalis\" (American sycamore) 
trees.']], ['Green shield bug', ['The green shield bug (\"Palomena prasina\") is a shield bug of the family Pentatomidae.', ' It may also be referred to as a green stink bug, particularly outside of Britain, although the name green stink bug more appropriately belongs to the larger North American stink bug, \"Acrosternum hilare\".', ' The adult green shield bug ranges in the colour of their backs from bright green to bronze, without any substantial markings.', ' Green shield bugs are a very common shield bug throughout Europe, including the British Isles, and are found in a large variety of habitats, including gardens.', ' They have been found as far north as 63° N latitude.']], ['Phyllocnistis liquidambarisella', ['Phyllocnistis liquidambarisella is a moth of the Gracillariidae family, known from the United States (New York, Maryland, Kentucky, Georgia, Texas, Florida).', ' The hostplant for the species is \"Liquidambar styraciflua\".', ' They mine the leaves of their host plant.', ' The mine has the form of a long, winding, linear mine on the upperside of the leaf.', ' It is rather indistinct, without any central line of frass.']], ['Liquidambar', ['Liquidambar, commonly called sweetgum (sweet gum in the UK), gum, redgum, satin-walnut, or American storax, is the only genus in the flowering plant family Altingiaceae with 15 species.', ' They were formerly often treated in Hamamelidaceae.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 160/500 [01:39<18:17, 3.23s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:22:40.677\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 161/500 [01:40<15:03, 2.66s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:41.537\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 162/500 [01:40<12:53, 2.29s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:41.619\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 33%|███▎ | 164/500 [01:41<08:25, 1.50s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:22:42.169\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72d3525542991f9a20c5ab', 'answer': '1905', 'question': \"What was the founding year of the county where Weber's Store is located?\", 'supporting_facts': [[\"Weber's Store\", 0], ['Sanders County, Montana', 3]], 'context': [['Karaj Payam Noor University', ['Karaj Payam Noor University is located in Karaj, Iran, and has two campuses.', ' The main campus is located in Gohardasht, and 
another campus is located on Ghalamestan Street.', ' The university was founded in 2000-01.', \" In the school's founding year, 70 students were admitted for their BS in accounting.\", ' The university now offers 52 courses at BS, BA, BE, MS, and MBA levels and has over 15,000 students.']], ['Louaize Club', ['Louaize Club is the basketball department of Notre Dame University – Louaize , a university basketball club basked in Zouk Mosbeh.', ' The club was established in the founding year of 1978 and is currently participating in the 2016 Lebanese Basketball League.']], ['Diocesan Native Female Training School', ['Diocesan Native Female Training School (DNFTS, ) was a school under the Anglican Church of Hong Kong in the 19th century, founded in 1860 and closed down in 1868.', \" Its premises now belong to today's Bonham Road Government Primary School().\", \" In 1869, another institution called Diocesan Home and Orphanage (DHO, later renamed Diocesan School and Orphanage, and now known as Diocesan Boys' School) was founded in the same place.\", ' Due to the obvious differences in founding groups, vision of education, personnel arrangement and students’ background, DNFTS has been regarded only as a forerunner, and called ‘the First Foundation’ by DHO and later DBS.', ' Using 1869 as its founding year, DBS calls itself ‘the Second Foundation’.', \" As for Diocesan Girls' School, founded in Rose Villas near DSO in 1899, it claims to be the successor of DNFTS and traces the founding year back to 1860.\"]], ['Sanders County, Montana', ['Sanders County is a county located in the U.S. 
state of Montana.', ' As of the 2010 census, the population was 11,413.', ' Its county seat is Thompson Falls.', ' The county was founded in 1905.']], ['Gufo Temple', ['Gufo Temple () is located on the bank of Qingshui River, Jingangku, Shanxi province, China and is the first temple to see if entering the Mount Wutai area from the south route.', \" According to Mount Wutai's history, there are only records of the renovation of this temple, but nothing concerning its founding year.\", ' Thus, it is speculated \"Old Buddha exists before the beginning of the world.', ' Gufo Temple (Old Buddha Temple) exists before Mount Wutai.\"']], ['Muscular Dystrophy Canada', ['Muscular Dystrophy Canada (MDC) (French: Dystrophie musculaire Canada ) is a non-profit organization that strives to find a cure for neuromuscular disorders.', ' Founded in 1954 as Muscular Dystrophy Association of Canada, volunteers and staff nationwide have helped to provide support and resources to those affected.', ' Since the founding year, over $64 million has been put towards research via collaborations, fundraising events, and donations.']], ['Symphony of Southeast Texas', ['The Symphony of Southeast Texas is an American orchestra based in Beaumont, Texas.', ' The orchestra, formerly known as the \"Beaumont Symphony Orchestra\", officially started in 1953; however, the impetus can be traced back as early as 1923 with the formation of the Beaumont Music Commission.', ' The 2015-16 season is the sixty-third consecutive season since the founding year.', \" The symphony's home theater is the Julie Rogers Theater in downtown Beaumont.\", ' The symphony lists over eighty musicians in the orchestra as of 2015.']], ['Stavanger', ['Stavanger (] ) is a city and municipality in Norway.', ' The city is the third-largest urban zone and metropolitan area in Norway (through conurbation with neighbouring Sandnes) and the administrative centre of Rogaland county.', ' The municipality is the fourth most populous in 
Norway.', ' Located on the Stavanger Peninsula in Southwest Norway, Stavanger counts its official founding year as 1125, the year the Stavanger Cathedral was completed.', \" Stavangers core is to a large degree 18th- and 19th-century wooden houses that are protected and considered part of the city's cultural heritage.\", \" This has caused the town centre and inner city to retain a small-town character with an unusually high ratio of detached houses, and has contributed significantly to spreading the city's population growth to outlying parts of Greater Stavanger.\"]], ['Skultuna mässingsbruk', ['Skultuna Messingsbruk is a Swedish company founded in 1607 at the bequest of King Karl IX.', ' Skultuna Messingsbruk is located in Skultuna on the outskirts of Västerås.', ' The logotype of Skultuna consists of the closed royal crown, the name \"Skultuna\" and the founding year \"1607\".']], [\"Weber's Store\", [\"Weber's Store, at 510 Main St. in Thompson Falls in Sanders County, Montana was listed on the National Register of Historic Places in 1986.\", ' It has also been known as Thompson Falls Laundry.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 33%|███▎ | 165/500 [01:41<06:55, 1.24s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:22:42.187\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8c6342554299240d9c214a', 'answer': 'Kelly Bundy', 'question': 'Claudine\\'s Return starred the actress who played which role on \"Married...with Children\"?', 'supporting_facts': [[\"Claudine's Return\", 0], ['Christina Applegate', 0]], 'context': [['Pierre Gaspard-Huit', ['Pierre Gaspard-Huit (29 November 1917 – 1 May 2017) was a French film director and screenwriter.', ' He directed the 1963 film \"Shéhérazade\", which starred Anna Karina.', ' He was once married to actress Claudine Auger when she was 18, and he was 43 years old.', ' She acted in several of his films.']], ['Alfred Rode', ['Alfred Rode (4 June 1905 – 22 July 1979) was an Italian-born French composer, musician, actor and film director.', ' He was born in Torre del Greco as Alfred Spedaliere.', ' In 1936 Rode appeared in the British film \"Gypsy Melody\" alongside Lupe Velez, which was a remake of his own 1935 film \"Juanita\".', ' Rode was married to the French actress Claudine Dupuis from 1951.']], ['Franchesca Salcedo', ['Franchesca \"Cruzita\" Salcedo (born Franchesca Salcedo on March 16, 2002 in San Pablo City, Laguna, Philippines) is a Filipina child actress .', ' She plays the title role of Cruzita Aldama Santibañez in \"MariMar\" daughter of Marimar Aldama and Sergio Santibañez.', ' Although her nickname in real life is Cruzita, her acting roles in both \"MariMar\" and \"Claudine\" had her playing a character named 
Cruzita, which she was given after her stint on the former.']], ['Claudine Auger', ['Claudine Auger (born Claudine Oger; 26 April 1941) is a French actress best known for her role as Bond girl Dominique \"Domino\" Derval in the James Bond film \"Thunderball\" (1965).', ' She earned the title of Miss France Monde and was also the first runner-up in the 1958 Miss World contest.']], ['Claudine Dupuis', ['Claudine Dupuis (born Andrée Esther Chaloum, 1 May 1924 in Paris – 26 May 1994 in Lisieux) was a French actress.', ' She starred as the \"garrulous prostitute Manon\" in Henri-Georges Clouzot\\'s \"Quai des Orfèvres\" in 1947.', ' Other films include \"The Fighting Men\" (1950), \"Les pépées font la loi\" (1954), \"Les pépées font la loi\" (1955), \"La fierecilla domada\" (1956) and \"Cuatro en la frontera\" (1958).', ' She was married to Alfred Rode.']], ['Bringing Up Bates', ['Bringing Up Bates is an American reality television show on Up TV.', ' It is centered around Gil and Kelly Jo Bates and their 19 children.', ' Gil and Kelly Jo got married on December 19, 1987, when he was 22 and she was 21.', ' Since then, they have had 9 boys and 10 girls, all of whom were born between the years 1988 and 2012, and Kelly Jo delivered every one of them.', ' There are no sets of multiples in their family either.', ' They have four children that are married: Zach (married Whitney Perkins), Michaella (married Brandon Keilen), Erin (married Chad Paine), and Alyssa (married John Webster).', \" Gil and Kelly Jo also have six grandchildren, two being Zach & Whitney's children, two being Chad & Erin's children, and the other two being John & Alyssa's children.\", ' The Bates family had a TV show in 2012 called \"United Bates of America\", and it was announced in October 2014 that the Bates family would return in a new series which would be called \"Bringing Up Bates\".', ' The series debuted on January 1, 2015.', ' UP TV revealed that the show would be returning for another season in 
June 2015.', ' The second season started on June 4, 2015.', ' The third season started on January 7, 2016 The fourth season started on June 2, 2016.', ' The fifth season started on January 5, 2017.', ' The sixth season began on June 1, 2017.']], ['Mandy Richardson', ['Mandy Richardson (also Hutchinson) is a fictional character from the British Channel 4 soap opera, \"Hollyoaks\", played by Sarah Jayne Dunn.', ' She debuted on-screen on 7 October 1996 and has been involved in such storylines including dealing with sexual abuse while she was a child by her father Dennis (David McAllister) and numerous failed relationships, the suicide of her brother Lewis, an on and off relationship with Tony Hutchinson (Nick Pickard) before the couple married.', ' Mandy and Tony had a daughter together who they named Grace, only for her to die from Sudden Infant Death Syndrome.', \" This led to the character and Dunn's exit from the serial in 2006.\", ' Dunn made a brief return in 2007 before making a return for six months in 2008.', ' Dunn again returned as Mandy in 2010 in a storyline which also saw the return of Warren Fox (Jamie Lomas).', ' In September 2011, Dunn announced her departure from the show and Mandy made her last appearance on 2 September 2011 before departing off-screen.', ' Dunn later returned to her role in the sixth series of \"Hollyoaks Later\" in October 2013.', ' In June 2017, it was announced that Dunn had reprised the role again and that Mandy would appear from July along with Luke Morgan played by Gary Lucy.', ' Mandy returned on 26 July 2017.']], ['Angela Lonsdale', ['Angela Lonsdale (born Angela Smith; 1970), is an English actress.', \" Born to a policeman father, Lonsdale's passion for acting was showcased in the Brewery Youth Theatre at the Brewery Arts Centre, Kendal.\", \" Working behind the box office, Lonsdale's talent was nurtured by the then Arts Centre Director, Anne Pierson.\", ' She took part in a large number of amateur productions, including 
plays by local playwrights John Newman-Holden and Tim Bull.', ' After initial rejection, Lonsdale then graduated from the Royal Scottish Academy of Music and Drama.', ' Lonsdale is best known for playing police officer Emma Taylor on \"Coronation Street\".', ' Taylor married veteran character Curly Watts, played by Kevin Kennedy.', ' After birth of their child, both characters left the programme in 2003.', ' She then took a regular part in the long-running television series \"The Bill\".', ' Lonsdale appeared as DI Eva Moore in the daytime BBC series \"Doctors\".', \" She left on 21 October 2008 after being shot and presumed dead by an old criminal acquaintance, but in actual reality left Leatherbridge for her own and Jimmi's safety.\", ' She made a brief return to \"Doctors\" in September 2011.', ' In 2012 and 2013 Lonsdale played the role of the mother in a family of wolves in children\\'s TV drama \"Wolfblood\".', ' Before they agreed on separation in 2010, Lonsdale was married to actor Perry Fenwick, who plays Billy Mitchell in \"EastEnders\".']], ['Christina Applegate', ['Christina Applegate (born November 25, 1971) is an American actress and dancer who, as an adolescent actress, started playing the role of Kelly Bundy on the Fox sitcom \"Married... 
with Children\" (1987–97).', ' In her adult years, Applegate established a film and television career, winning an Emmy and earning Tony and Golden Globe nominations.', ' She is also known for doing the voice of Brittany in the \"Alvin and the Chipmunks\" film series.']], [\"Claudine's Return\", [\"Claudine's Return is a movie released in 1998 starring Christina Applegate.\", ' It was filmed almost entirely on the American island of Tybee Island, Georgia with a few shots from the surrounding areas.', ' It was released as Kiss of Fire on DVD.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:22:42.266\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:46.751\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▎ | 168/500 [01:46<07:37, 1.38s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:46.830\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.141\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▍ | 170/500 [01:46<05:32, 1.01s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.202\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.212\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.255\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▍ | 173/500 [01:46<03:22, 1.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.379\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.389\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▌ | 175/500 [01:46<02:32, 2.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:47.420\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▌ | 177/500 [01:47<01:55, 2.79it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:22:47.726\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:53.297\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:22:53.299\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 36%|███▌ | 179/500 [01:52<05:47, 1.08s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:53.394\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 36%|███▋ | 182/500 [01:53<03:45, 1.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:22:53.748\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 37%|███▋ | 183/500 [01:53<03:16, 1.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:22:53.796\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 37%|███▋ | 185/500 [01:55<04:05, 1.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 38%|███▊ | 190/500 [01:56<01:53, 2.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 39%|███▉ | 196/500 [01:56<00:55, 5.44it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 
0.5333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|███▉ | 198/500 [01:56<00:49, 6.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|████ | 200/500 [01:56<00:44, 6.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 41%|████ | 204/500 [01:57<00:37, 7.97it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 42%|████▏ | 209/500 [01:57<00:24, 11.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 43%|████▎ | 215/500 [01:57<00:16, 17.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 
1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.7777777777777778, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▎ | 218/500 [01:58<00:30, 9.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 45%|████▍ | 224/500 [01:58<00:23, 11.86it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:00.272\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 45%|████▌ | 226/500 [01:59<00:50, 5.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 229/500 [02:00<00:42, 6.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 231/500 [02:00<00:50, 5.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.21428571428571425, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▋ | 232/500 [02:00<00:46, 5.72it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 48%|████▊ | 239/500 [02:01<00:31, 8.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 49%|████▉ | 245/500 [02:01<00:26, 9.47it/s]" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 49%|████▉ | 247/500 [02:02<00:30, 8.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 51%|█████ | 253/500 [02:02<00:23, 10.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:08.065\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 15 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 51%|█████▏ | 257/500 [02:07<02:08, 1.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 52%|█████▏ | 258/500 [02:07<01:53, 2.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 52%|█████▏ | 259/500 [02:22<12:43, 3.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:23.518\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a829c1d55429966c78a6a67', 'answer': 'court in Åbo', 'question': \"What did the man who led Sweden to military supremacy during the Thirty Years' War found in 1623?\", 'supporting_facts': [['Hovrätt', 2], ['Gustavus Adolphus of Sweden', 1]], 'context': [['Johann von Geyso', [\"Johann von Geyso (1593 – 1661) was a German nobleman and General-Lieutenant, who fought during the course of the Thirty Years' War.\", ' After studying in a Dutch military academy, Geyso fought as a mercenary in the armies of Sweden, Bohemia, Denmark and the German Protestant Union.', \" In 1628, having gained significant experience in warfare he returned to his native Hesse-Kassel which he served until the end of the Thirty Years' War, reaching the rank of commander in chief of the Langraviate's forces and becoming ennobled.\"]], ['Wallenstein 
(novel)', ['Wallenstein is a 1920 historical novel by German author Alfred Döblin.', \" Set in Central Europe during the Thirty Years War, the novel's plot is organized around the polar figures of Ferdinand II, Holy Roman Emperor, on the one hand, and Albrecht von Wallenstein, on the other.\", \" Döblin's approach to narrating the war differed from prevailing historiography in that, rather than interpreting the Thirty Years War primarily as a religious conflict, he portrays it critically as the absurd consequence of a combination of national-political, financial, and individual psychological factors.\", ' Döblin saw a strong similarity between the Thirty Years War and the First World War, during which he wrote \"Wallenstein\".', ' The novel is counted among the most innovative and significant historical novels in the German literary tradition.', ' In large part, contemporary critics found the novel to be difficult, dense, and chaotic—a reception Döblin discussed in his 1921 essay \"The Epic Writer, His Material, and Criticism\"—yet writers such as Lion Feuchtwanger, Franz Blei, and Herbert Ihering praised \"Wallenstein\" for its formal innovation, poetic language, epic scope, and bold departure from other German writing of the time.', ' Despite the novel\\'s difficulty, the critical consensus was that \"Wallenstein\" was a major achievement and confirmed the promise seen in Döblin\\'s earlier historical novel, \"The Three Leaps of Wang Lun\".']], ['Hovrätt', ['Hovrätt (Finnish: Hovioikeus ) (literally \"Royal Court\") was the highest judicial body in Sweden until King Gustav III founded the Supreme Court of Sweden in 1789.', ' The first hovrätt, Svea hovrätt, was founded 1614 in Stockholm.', ' In Finland, then a part of Sweden, the court in Åbo was founded in 1623 by Gustavus Adolphus, mainly due to the distance to Stockholm.', ' Today, these courts mostly function as an appellate court, the second highest judicial body in both Sweden and Finland.']], [\"Swedish 
intervention in the Thirty Years' War\", [\"The Swedish invasion of the Holy Roman Empire, or the Swedish Intervention in the Thirty Years' War is a historically accepted division of the Thirty Years' War.\", \" It was a military conflict that took place between 1630 and 1635, during the course of the Thirty Years' War.\", ' It was a major turning point of the war, as during this time, the Protestant cause, previously on the verge of defeat, won several major victories and snatched victory away from the Habsburg-Catholic coalition.', ' It is often considered to be an independent conflict by most historians.']], ['The Last Valley (novel)', [\"The Last Valley (1959), by J. B. Pick, is an historical novel about the Thirty Years' War (1618–1648).\", ' The story occurs from September 1637 to March 1638, and centres on two men – a mercenary soldier and an intellectual – who are fleeing the destruction and starvation wrought by religious war.', ' In southern Germany, each man stumbles upon a fertile valley untouched by the war.', ' Soldier and intellectual, man of arms and man of mind, must collaborate to preserve the peace and plenty of the last valley from the stress and strain of the religious bigotry that caused thirty years of war in Europe.']], ['John Ruthven (general)', [\"John Ruthven was a military officer who served in Denmark and Sweden during the Thirty Years' War before returning for brief service in the British Civil Wars.\", ' He served first as a captain in Danish service from 1627.', ' As King Christian IV of Denmark-Norway made peace with the Habsburg Emperor in 1629 Ruthven, along with many other Scottish soldiers in Danish service, then turned to Sweden to continue the war.', ' He first appears in Swedish service in 1629 serving as a captain of the Scottish infantry at Stralsund under the command of Alexander Leslie.', \" He was soon promoted lieutenant-colonel in Leslie's infantry regiment (by 1630) and led an infantry-regiment in the battle of 
Breitenfeld on 17 September 1631 as full colonel.\", ' He later took part in the battle at the Alte Veste near Nuernberg on 3 September 1632, and later took part in the bloody conquest of Landsberg/Lech (Bavaria) under the command of Lennart Torstensson.']], ['Military history of Iran', ['With thousands of years of recorded history, and due to an unchanging geographic (and subsequently geopolitical) condition, Iran (previously known as Persia in the West until 1935) has had a long, varied, and checkered military culture and history, ranging from triumphant and unchallenged ancient military supremacy affording effective superpower status in its day, to a series of near catastrophic defeats (beginning with the destruction of Elam) at the hand of previously subdued and conquered peripheral nations (including Greece, Macedon and the Asiatic nomadic tribes at the Eastern boundary of the lands traditionally home to the Iranian people).']], ['Hakkapeliittain Marssi', ['Hakkapeliittain marssi (\"March of the Hakkapeliittas\") or Finska Rytteriets Marsch \"in Swedish\" (\"March of the Finnish Cavalry\"), also known as Suomalaisen ratsuväen marssi 30-vuotisessa sodassa or Finska rytteriets marsch i trettioåriga kriget (\"March of the Finnish cavalry in 30 years war\") is one of the Finnish and Swedish cavalry\\'s battle marches and one of the oldest currently played.', \" It originates from the times of Thirty Years' War when Finnish cavalrymen were known as hakkapeliitta and it became popular with military bands.\", ' It was given lyrics (in Swedish) in 1872 by Zacharias Topelius and is commonly known as the \"March of the Finnish Cavalry during the Thirty Years War\".', ' The Prussian army officially adopted it for use in 1891; it is now a standard of the German marching band repertoire.']], ['Charles X Gustav of Sweden', ['Charles X Gustav, also Carl Gustav (Swedish: \"Karl X Gustav\" ; 8 November 1622 – 13 February 1660), was King of Sweden from 1654 until his death.', ' 
He was the son of John Casimir, Count Palatine of Zweibrücken-Kleeburg and Catherine of Sweden.', \" After his father's death he also succeeded him as Pfalzgraf.\", ' He was married to Hedwig Eleonora of Holstein-Gottorp, who bore his son and successor, Charles XI.', ' Charles X Gustav was the second Wittelsbach king of Sweden after the childless king Christopher of Bavaria (1441–1448) and he was the first king of the Swedish \"Caroline era\", which had its peak during the end of the reign of his son, Charles XI.', ' He led Sweden during the Second Northern War, enlarging the Swedish Empire.', ' By his predecessor Christina, he was considered \"de facto\" Duke of Eyland (Öland) before ascending to the Swedish throne.']], ['Gustavus Adolphus of Sweden', ['Gustav II Adolf (9 December 1594 – 6 November 1632, O.S.), widely known in English by his Latinised name Gustavus Adolphus or as Gustav II Adolph, was the King of Sweden from 1611 to 1632 and is credited as the founder of Sweden as a Great Power (Swedish: \"Stormaktstiden\" ).', \" He led Sweden to military supremacy during the Thirty Years' War, helping to determine the political as well as the religious balance of power in Europe.\", ' He was formally and posthumously given the name Gustavus Adolphus the Great (Swedish: \"Gustav Adolf den store\" , Latin: \"Gustavus Adolphus Magnus\" ) by the Riksdag of the Estates in 1634.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:23:23.530\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a77c15f5542997042120b1c', 'answer': 'Leafcutter John', 'question': 'In the British experimental jazz band Polar Bear, who handles digital devices?', 'supporting_facts': [['Polar Bear (British band)', 0], ['Electronic musical instrument', 0]], 'context': [['Polar Bear (album)', [\"Polar Bear is the eponymous third album by Sebastian Rochford's British jazz band Polar Bear.\"]], ['Electronic musical instrument', ['An electronic musical instrument is a musical instrument that produces sound using electronic circuitry and/or digital devices.', ' Such an instrument sounds by outputting an electrical, electronic or digital audio signal that ultimately is plugged into a power amplifier which drives a loudspeaker, creating the sound heard by the performer and/or listener.']], ['Dim Lit', ['Dim Lit is the debut album by British jazz band Polar Bear, formed and led by drummer Sebastian Rochford.']], ['Shirokuma Cafe', ['Shirokuma Cafe (Japanese: しろくまカフェ , Hepburn: Shirokuma Kafe , lit.', ' \"Polar Bear Café\") is a Japanese manga series by Aloha Higa (ヒガ アロハ , Higa Aroha ) .', ' It revolves around the everyday lives of a group of animals mingling with humans at a café run by a polar bear.', ' An anime adaptation by Studio Pierrot aired in Japan between April 2012 and March 2013.', ' While it never received an official international release (mostly due to its heavy emphasis on Japanese wordplay, which complicates the potential for dubbing into other languages), it is available on the streaming website Crunchyroll as Polar Bear Cafe 
alongside the television broadcast for global audiences.']], ['Polar Bear (locomotive)', ['Polar Bear is a Bagnall steam locomotive built in 1905 for the Groudle Glen Railway, to supplement the similar but slightly smaller \"Sea Lion\".', ' The two Bagnalls were temporarily taken out of service in the 1920s when they were replaced by a pair of battery locomotives.', ' These proved unsatisfactory, and \"Polar Bear\" and \"Sea Lion\" were returned to traffic.', ' The railway was closed for the duration of World War II, and when the line reopened in the late 1940s only \"Polar Bear\" was returned to traffic.', ' Following the 1962 closure of the GGR, \"Polar Bear\" was sold to the Brockham Museum Trust in 1967.', ' In 1982 it passed, with the rest of the Brockham collection, to the Amberley Museum Railway, where it was returned to traffic in the early 1980s.', ' \"Polar Bear\"\\'s boiler was condemned around 1988, returning to service with a new boiler in 1993.', ' Its boiler certificate expired at the end of 2010; with a retube and work on the firebox being required before a return to service.', ' Since being based at Amberley, \"Polar Bear\" has returned to the Groudle Glen on three occasions (1993, 1996 and 2005) to visit.']], ['Held on the Tips of Fingers', [\"Held On The Tips Of Fingers is the second album by Sebastian Rochford's British jazz band Polar Bear.\"]], ['Polar Bear (British band)', ['Polar Bear is a British experimental jazz band led by drummer Seb Rochford with Pete Wareham and Mark Lockheart on tenor saxophone, Tom Herbert on double bass and Leafcutter John on electronics and occasionally guitar or mandolin.']], ['Same as You', ['Same as You is the sixth studio album by British jazz band Polar Bear.', ' It was released on 30 March 2015 by The Leaf Label.']], ['In Each and Every One', [\"In Each and Every One is the fifth album by Sebastian Rochford's British jazz band Polar Bear.\"]], ['Peepers (album)', [\"Peepers is the fourth album by Sebastian 
Rochford's British jazz band Polar Bear.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 264/500 [02:23<05:25, 1.38s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:23.643\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a713f3c5542994082a3e6eb', 'answer': 'Jacksonville station', 'question': 'Where does the train that runs from NYC and Miami station at in Florida?', 'supporting_facts': [['Jacksonville station', 0], ['Jacksonville station', 1], ['Silver Meteor', 0]], 'context': [['Chapman Field (Miami)', ['Chapman Field (officially the Subtropical Horticulture Research Station) is a horticulture and agronomy research facility of the Agricultural Research Service, a division of the United States Department of Agriculture (USDA), located in Miami, Florida.', ' Dating from 1898, it is one of the oldest entities in South Florida.', ' The USDA also refers to it as the Miami Station.']], ['Silver Meteor', ['The Silver Meteor is a passenger train operated by Amtrak between 
New York City and Miami, Florida.', ' The first diesel-powered streamliner between New York and Florida, since being introduced by the Seaboard Air Line Railroad (SAL) in 1939, it remains in operation now.', ' The train is part of Amtrak\\'s \"Silver Service\" along with the \"Silver Star\", another former SAL streamliner.']], ['South Miami station', ['South Miami station is a station on the Metrorail rapid transit system in South Miami, Florida.', ' This station is located at the intersection of South Dixie Highway (US 1) and Sunset Drive (SW 72nd Street/SR 986), two blocks west of Red Road (West 57th Avenue).', ' It opened to service May 20, 1984.']], ['Lakeland station', ['Lakeland station is a train station in Lakeland, Florida, that is served by Amtrak, the national passenger rail system of the United States.', ' It is served by the \"Silver Star\" train, which runs daily between New York City and Miami.', ' The station is located on the northern shore of Lake Mirror.']], ['Miami Station, Missouri', ['Miami Station is an unincorporated community in Carroll County, Missouri, United States.', ' Miami Station is located along Missouri Supplemental Route V 2.5 mi northwest of Miami.', ' Miami Station was laid out in 1870 as a station on the St. Louis, Kansas City and Northern Railway; it served as the main freight station for Miami.', ' A post office called Miami Station was established in 1869, and remained in operation until 1951.', ' U.S. Senator William A. 
Blakley was born in Miami Station.']], ['JMWAVE', ['JMWAVE or JM/WAVE or JM WAVE was the codename for a major secret United States covert operations and intelligence gathering station operated by the CIA from 1961 until 1968.', ' It was headquartered in Building 25 on the South Campus of the University of Miami in Miami, Florida.', ' (This location was formerly the site of Richmond Naval Air Station, an airship base about 12 miles south of the main campus; after the airship base closed, it has been used by the University of Miami since 1948.)', ' The intelligence facility was also referred to as the CIA\\'s \"Miami Station\" or \"Wave Station\".']], ['Jacksonville station', ['Jacksonville station is an Amtrak train station in Jacksonville, Florida, United States.', ' It serves the \"Silver Meteor\" and \"Silver Star\" trains as well as the Thruway Motorcoach to Lakeland.', \" The station lies next door to a freight facility with its own platform and is also just east of Norfolk Southern's Simpson Yard.\"]], ['Miami Worldcenter', ['Miami Worldcenter is a large mixed-use development under construction led by principals Arthur Falcone and Nitin Motwani, spanning several blocks in the Park West neighborhood of Miami, Florida, just north of Downtown.', \" It may include over 25 acres of land, with a convention center, hotel space, residential, as well as copious street level retail and large anchor tenant space, such as Macy's and Bloomingdale's.\", ' The hotel and convention center are planned to be part of the same 55 storey building.', ' The hotel will be very large with 1,800 rooms over the approximately 600000 sqft convention center.', ' One proposed residential building known as the Miami Worldcenter Signature Tower may rise to the maximum 749 ft above sea level permitted in that area.', \" The project may connect with the under construction All Aboard Florida intercity higher-speed rail system's Miami station.\"]], ['Government Center (Miami)', ['Government 
Center is a district in Downtown Miami, Florida.', ' Bounded roughly by I-95 and NW 3rd Avenue to the west, SW 1st Street to the south, NW 5th Street to the north, and NE 1st Avenue to the east, Government Center is located on the western edge of downtown.', ' The area includes several courthouses, including the historic Miami-Dade County Courthouse and a US district court, the City of Miami police headquarters, city, county, and state offices.', ' The eponymous and most used county transit station, Government Center, serving Metrorail, Metromover, and Metrobus, is located in the bottom of the Stephen P. Clark Government Center building.', ' Directly south of this is the main branch of the Miami-Dade Public Library System, as well as the HistoryMiami museum.', \" Henry Flagler's Florida East Coast Railroad owns roughly nine acres in the middle of Government Center, the site of its former Miami station, which spans several blocks.\", ' While the station was destroyed in 1963 and the site had been used as surface parking lots in the decades following, the railroad never gave up ownership of the property.', ' In mid 2014, the lots were closed down for construction of a new Downtown Miami intercity rail station, as part of their All Aboard Florida system.']], ['Miami station (Amtrak)', ['Miami station is a train station in Miami-Dade County, Florida, on the border of Miami and Hialeah.', ' It is the southern terminus for Amtrak\\'s \"Silver Meteor\" and \"Silver Star\" trains.', ' The station opened in 1978 to replace a 48-year-old Seaboard Air Line Railroad station.', ' It is several blocks away from the Tri-Rail and Metrorail Transfer Station, but there is no direct connection between the stations.', ' The station was scheduled to be replaced by Miami Central Station in Fall 2016, but was delayed to late 2017.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests 
to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 54%|█████▎ | 268/500 [02:23<03:19, 1.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:23.868\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8732505542991e77181713', 'answer': 'the Nazi Party', 'question': 'Friedrich Ratzel coined the phrase Lebensraum; which political movement made it notorious by taking it to extremes?', 'supporting_facts': [['Friedrich Ratzel', 0], ['Lebensraum', 2]], 'context': [['Musar movement', ['The Musar movement (also Mussar movement) is a Jewish ethical, educational and cultural movement that developed in the 19th century in Lithuania, particularly among Orthodox Lithuanian Jews.', ' The Hebrew term \"Musar\" (), is from the book of \"Proverbs\" 1:2 meaning moral conduct, instruction or discipline.', ' The term was used by the Musar movement to refer to efforts to further ethical and spiritual discipline.', ' The Musar Movement made significant contributions to Musar literature and Jewish Ethics.']], ['Black Power movement', ['The Black Power movement was a political movement to achieve a form of Black Power and the many philosophies it contains.', ' The movement saw various forms of activism some violent and some peaceful, all hoping 
to achieve black empowerment.', ' The Black Power movement also represented socialist movements, all with the general motivation of improving the standing of black people in society.', ' Originated during the Civil Rights Movement, some doubted the philosophy of the movement begging for more radical action, taking influences from Malcolm X.', ' The cornerstone of the movement was the Black Panther Party, a Black Power organization dedicated to socialism and the use of violence to achieve it.', ' The Black Power movement developed amidst the criticisms of the Civil Rights Movement in the early 1960s, and over time and into the 1970s, the movement grew and became more violent.', ' After years of violence, many left the movement and the police began arresting violent actors in the movement.', ' The Black Power movement also spilled out into the Caribbean creating the Black Power Revolution.']], ['Rudolf Kjellén', ['Johan Rudolf Kjellén (] , 13 June 1864, Torsö – 14 November 1922, Uppsala) was a Swedish political scientist and politician who first coined the term \"geopolitics\".', ' His work was influenced by Friedrich Ratzel.', ' Along with Alexander von Humboldt, Karl Ritter, and Ratzel, Kjellén would lay the foundations for the German \"Geopolitik\" that would later be espoused prominently by General Karl Haushofer.']], ['Lebensraum', ['The German concept of Lebensraum (] , English: \"living space\" ) refers to policies and practices of settler colonialism which proliferated in Germany from the 1890s to the 1940s.', ' First popularized around 1901, \"Lebensraum\" became a geopolitical goal of Imperial Germany in World War I (1914–1918) originally, as the core element of the \"Septemberprogramm\" of territorial expansion.', ' The most extreme form of this ideology was supported by the Nazi Party (NSDAP) and Nazi Germany until the end of World War II.']], ['History of Corsica', ['That the history of Corsica has been influenced by its strategic position at the heart 
of the western Mediterranean and its maritime routes, only 12 km from Sardinia, 50 km from the Isle of Elba, 80 km from the coast of Tuscany and 200 km from the French port of Nice, was first proposed by the 19th-century German theorist, Friedrich Ratzel.', ' To him is often attributed the description \"mountain in the sea\".', ' Regardless of whether he used that particular phrase the idea is expressed in his magnum opus, \"Anthropogeographie\", which calls Corsica']], ['Al-Ard', ['Al-Ard (Arabic: الارض\\u200e \\u200e , \"The Land\") was a Palestinian political movement made up of Arab citizens of Israel active between 1958 and some time in the 1970s which attracted international attention.', ' Following unsuccessful efforts to secure registration of the organization as an Israeli NGO and secure it a publishing permit, it was outlawed in 1964.', ' The political movement\\'s goal was, according to political historian David McDowall, \"to achieve complete equality and social justice for all classes of people in Israel\" and \"to find a just solution for the Palestine problem as a whole, and as an indivisible unit.\"', \" Al-Ard's disappearance as a movement was linked both to governmental and popular resistance, with the Israeli Community Party denouncing the group and Palestinian Arab communities inside of Israel concerned that Al-Ard might destroy them.\"]], ['Nine-Hour Movement', ['The Nine-Hour Movement started in Canada in 1872, based out of Hamilton, Ontario.', \" This marked Canada's first national attempt at a labour movement, pushing for the nine-hour work day which united both unionized and non-unionized workers alike.\", \" The movement came to its height in May 1872 when a collective force of 1,500 workers demonstrated in Hamilton in a parade-style fashion, which is coined as being the precursor to the traditional holiday of Canada's Labour Day.\", ' Although the movement was an overall failure, as it failed to deliver the nine-hour work day to the 
majority of work forces and industries, this movement made a major mark in labour relations in Canada.']], ['Gongche Shangshu movement', ['The Gongche Shangshu movement (Traditional Chinese: 公車上書, Simplified Chinese: 公车上书) was a political movement in late Qing dynasty China, seeking reforms and expressing opposition to the Treaty of Shimonoseki in 1895.', ' It is considered the first modern political movement in China.', \" Leaders of the movement later became leaders of the Hundred Days' Reform.\"]], ['Friedrich Ratzel', ['Friedrich Ratzel (August 30, 1844 – August 9, 1904) was a German geographer and ethnographer, notable for first using the term \"Lebensraum\" (\"living space\") in the sense that the National Socialists later would.']], ['Grassroots', ['A grassroots movement (often referenced in the context of a political movement) is one which uses the people in a given district as the basis for a political or economic movement.', ' Grassroots movements and organizations use collective action from the local level to effect change at the local, regional, national, or international level.', ' Grassroots movements are associated with bottom-up, rather than top-down decision making, and are sometimes considered more natural or spontaneous than more traditional power structures.', ' Grassroots movements, using self-organization, encourages community members to contribute by taking responsibility and action for their community.', ' Grassroots movements utilize a variety of strategies from fundraising and registering voters, to simply encouraging political conversation.', ' Goals of specific movements vary, but the movements are consistent in their focus on increasing mass participation in politics.', ' These political movements may begin as small and at the local level, but grassroots politics as Cornel West contends are necessary in shaping progressive politics as they bring public attention to regional political concerns']]], 'type': 'bridge', 'level': 'hard'}\n", 
"Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:23:23.878\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae122465542997b2ef7d0f4', 'answer': 'the Allies and Nazi Germany', 'question': 'The uncle of Paul Capellani died at a military operation during a battle was fought between who?', 'supporting_facts': [['Paul Capellani', 0], ['Paul Capellani', 1], ['Battle of Dunkirk', 0], ['Battle of Dunkirk', 1]], 'context': [['Operation Mersad', ['Operation Mersad (Persian: عملیات مرصاد\\u200e \\u200e , meaning \"ambush\") was the last major military operation of the Iran–Iraq War, ending in a decisive victory for Iran.', ' The operation involved a successful counterattack against a July 1988 military incursion from Iraq, by a military force of about 7,000 members of the Mujahadeen-e-Khalq (MEK).', ' The MEK soldiers were armed, equipped and given air support by Iraq.', ' Led by Lt. 
General Ali Sayad Shirazi, Operation Mersad began on 26 July 1988 and lasted only a few days, where the Iranian Armed Forces crushed the MEK in what was the last military operation of any significance of the war.']], ['Paul Capellani', ['Paul Capellani (September 9, 1877 – November 7, 1960) was a noted French silent film actor.', ' His brother was the director Albert Capellani and his uncle the film director Roger Capellani who died May 1940 at the Battle of Dunkirk.']], ['Albert Capellani', ['Albert Capellani (23 August 1874 – 26 September 1931) was a French film director and screenwriter of the silent era.', ' He directed films between 1905 and 1922.', ' One of his brother was the actor-sculptor Paul Capellani.', ' and another the film director Roger Capellani.']], ['Roger la Honte (1913 film)', [\"Roger la Honte or A Man's Shadow is a 1913 French silent historical drama film directed by Adrien Caillard and starring Georges Dorival, Paul Capellani and Henri Collen.\", ' It is an adaption of the novel of the same title by Jules Mary, which has been filmed a further four times since.']], ['Battle of Dunkirk', ['The Battle of Dunkirk was a military operation that took place in Dunkirk (Dunkerque), France, during the Second World War.', ' The battle was fought between the Allies and Nazi Germany.', ' As part of the Battle of France on the Western Front, the Battle of Dunkirk was the defence and evacuation of British and Allied forces in Europe from 26 May to 4 June 1940.']], ['Operation Balavegaya', ['Operation Balavegaya (Operation Power force) was a combined military operation launched by the Sri Lankan military in Jaffna, the largest amphibious assault in its history.', ' Operation Balavegaya was launched in response to the siege of Elephant Pass by the LTTE.', ' It is believed that Operation Balavegaya was the largest and most successful military operation of the Sri Lankan military until Operation Riviresa in 1995.']], ['Roger Capellani', ['Roger Capellani (31 
January 1905 – 30 May 1940) was a French film director, the son of film director and screenwriter Albert Capellani and the nephew of the actor Paul Capellani.']], ['La Bohème (1916 film)', ['La Bohème (aka:La vie de Bohème) is a 1916 silent historical film directed by Albert Capellani and distributed by World Pictures.', ' The star of this version is Alice Brady, whose father William A. Brady was the founder of World Pictures.', ' This film is one of many silent versions, actually the third or fourth.', ' Later silent versions appeared in 1917 and 1926 starring Lillian Gish.', \" Director Albert Capellani's brother, Paul Capellani, who appears in this film, had made his own short version in 1912.\"]], ['Camille (1915 film)', ['Camille is a 1915 American silent film based on the story \"La Dame aux Camélias\" (\"The Lady of the Camellias\") by Alexandre Dumas, \"fils\", first published in French as a novel in 1848 and as a play in 1852.', ' Adapted for the screen by Frances Marion, \"Camille\" was directed by Albert Capellani and starred Clara Kimball Young as Camille and Paul Capellani as her lover, Armand.']], ['Patrie (1917 film)', ['Patrie is a 1917 French film by Albert Capellani after the drama of Victorien Sardou.', ' The film featured Henry Krauss as the Count of Rysoor, Paul Capellani as Karloo Van der Noot, Léon Bernard as Ionas, and Maxime Desjardins as the Duke of Alba.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 54%|█████▍ | 270/500 [02:39<03:17, 1.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:23:40.404\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 54%|█████▍ | 271/500 [02:39<08:33, 2.24s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:40.451\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:41.383\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:23:41.383\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 55%|█████▌ | 275/500 [02:40<05:21, 1.43s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5625, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:46.953\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 55%|█████▌ | 277/500 [02:46<06:31, 1.76s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 278/500 [02:46<05:40, 1.53s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:47.475\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 279/500 [02:46<04:46, 1.30s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:47.563\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:47.628\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 281/500 [02:47<03:12, 1.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:47.675\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:47.676\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:47.730\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 284/500 [02:47<01:53, 1.91it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:53.533\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 286/500 [02:52<04:17, 1.21s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.546\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.558\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.579\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.23529411764705882, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:23:53.631\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.648\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 58%|█████▊ | 292/500 [02:53<01:54, 1.81it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.714\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.754\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:23:53.989\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:23:53.991\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 2 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 59%|█████▉ | 295/500 [02:53<01:27, 2.34it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 59%|█████▉ | 297/500 [02:54<01:28, 2.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 60%|██████ | 301/500 [02:56<01:23, 2.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 61%|██████ | 305/500 [02:56<00:51, 3.81it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 
1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 62%|██████▏ | 309/500 [02:56<00:34, 5.53it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 311/500 [02:56<00:30, 6.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 63%|██████▎ | 313/500 [02:57<00:24, 7.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 63%|██████▎ | 317/500 [02:57<00:22, 8.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 319/500 [02:57<00:24, 7.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666665, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 322/500 [02:58<00:21, 8.12it/s]" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 66%|██████▌ | 329/500 [02:58<00:14, 12.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 67%|██████▋ | 335/500 [02:58<00:09, 16.86it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 338/500 [03:00<00:25, 6.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 340/500 [03:00<00:24, 6.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 69%|██████▉ | 346/500 [03:01<00:21, 7.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics 
{'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|██████▉ | 348/500 [03:01<00:19, 7.94it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.04444444444444445, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|███████ | 350/500 [03:01<00:20, 7.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|███████ | 352/500 [03:02<00:22, 6.55it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 71%|███████ | 356/500 [03:02<00:15, 9.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 362/500 [03:02<00:13, 10.55it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 
0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 73%|███████▎ | 367/500 [03:08<01:11, 1.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 73%|███████▎ | 367/500 [03:19<01:11, 1.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:23.560\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae2a9c6554299492dc91c42', 'answer': 'Tumi Holdings, Inc.', 'question': 'Which New Jersey-based manufacturer of suitcases and bags for travel is located in the Shops at Columbus Circle in New York City?', 'supporting_facts': [['The Shops at Columbus Circle', 0], ['The Shops at Columbus Circle', 1], ['Tumi Inc.', 0]], 'context': [['Per Se (restaurant)', ['Per Se is a New American and French restaurant located on the fourth floor of the Time Warner Center at 10 Columbus Circle (at West 60th Street and Broadway) in Manhattan in New York City, owned by chef Thomas Keller.', ' In 2011, it was called the best restaurant in New York City by \"The New York Times\".', ' The chef is Eli Kaimeh.', ' Per Se is currently the third most expensive restaurant in the world after Sublimotion and Urasawa with an average guest spending approximately $851.']], ['Campuses of Fordham University', ['The Campuses of Fordham University are located within New York City and the New York City metropolitan area.', \" The 
university's original Rose Hill campus is located in The Bronx on Fordham Road, while the Lincoln Center campus is located in Manhattan, one block west of Columbus Circle.\", ' The Westchester campus is located in Harrison, New York in Westchester County.', ' Additionally, Fordham University maintains a study abroad center in the United Kingdom and field offices in Spain and South Africa.']], ['Tumi Inc.', ['Tumi Holdings, Inc., is a South Plainfield, New Jersey-based manufacturer of suitcases and bags for travel.', ' Founded in 1975 by Charlie Clifford after a stint in the Peace Corps in Peru, the company is named after a Peruvian ceremonial knife used for sacrifices.', ' Tumi, Inc. was a unit of Doughty Hanson & Co. from 2004 until after its 2012 initial public offering.']], ['Time Warner Center', ['Time Warner Center is a mixed use (office/commercial and residential) twin-tower building in New York City.', ' Developed by The Related Companies and AREA Property Partners (formerly known as Apollo Real Estate Advisors), its design by David Childs and Mustafa Kemal Abadan of Skidmore, Owings & Merrill, consists of two 750 ft twin towers bridged by a multi-story atrium containing upscale retail shops.', ' Construction began in November 2000, following the demolition of the New York Coliseum, and a topping-out ceremony was held on February 27, 2003.', ' The property had the highest-listed market value in New York City, $1.1 billion, in 2006.', ' Originally constructed as the AOL Time Warner Center, the building encircles the western side of Columbus Circle and straddles the border between Midtown and the Upper West Side.', ' The total floor area of 2.8 e6ft2 is occupied by office space (notably the offices of Time Warner and an R&D Center for VMware), residential condominiums, and the Mandarin Oriental, New York hotel.', ' The Shops at Columbus Circle is an upscale shopping mall located in a curving arcade at the base of the building, with a large Whole Foods Market 
grocery store on the lower level.']], ['Columbus Circle (Syracuse, New York)', ['Columbus Circle is a neighborhood and plaza in the downtown section of Syracuse, New York.', ' Columbus Monument was designed by the Syracuse-born architect, Dwight James Baum in 1934.', \" Columbus Circle is home to Syracuse's two cathedrals, the Episcopalian St. Paul's Cathedral and the Roman Catholic Cathedral of the Immaculate Conception.\", \" As well as County Court House and the County's John H. Mulroy Civic Center, home of the Onondaga County Government.\"]], ['The Shops at Columbus Circle', ['The Shops at Columbus Circle is an urban shopping mall in the Time Warner Center in Manhattan, New York City — a complex of skyscrapers that was completed in 2003.', ' It is located at Columbus Circle, next to the southwestern corner of Central Park.', \" The shopping mall includes Amazon Books, H&M, L'Occitane, Michael Kors, Hugo Boss, Tumi, Coach, Cole Haan, Thomas Pink, J.Crew and Stuart Weitzman.\", ' The mall also has several restaurants such as the Michelin 3-star Per Se, Masa (allegedly the most expensive restaurant in New York ), the East Coast flagship of Williams-Sonoma, and a Whole Foods Market.', ' It is owned by The Related Companies.']], ['Forman Mills', ['Forman Mills, Inc. 
is a Pennsauken, New Jersey-based retail chain and department store with 35 stores, located in Philadelphia, Baltimore, Delaware, New Jersey, Washington DC, Chicago, Cleveland, Detroit, New York City and their suburbs.', ' They also operate a store at the Iverson Mall in Hillcrest Heights, Maryland.', ' It was begun by Richard Forman when he started selling items at the Columbus Farmers Market.', ' The chain is known for their low-priced designer clothing such as shirts, pants, shorts, capri pants, and hats.']], ['Chris Doyle (artist)', ['Chris Doyle is a multi-media artist who lives in New York City.', ' His major public projects have included BRIGHT CANYON, presented by the Times Square Alliance (2014); LEAP, presented by Creative Time in Columbus Circle (2000) and Commutable, presented by the Public Art Fund on the Lower East Side (1996), all in New York City.', ' His work has also been shown at The Brooklyn Museum of Art, The Queens Museum of Art, P.S.1 Museum of Contemporary Art, the Kupferstichkabinett Berlin, Germany, and as part of the New York Video Festival at Lincoln Center.', ' In 2015 he created a major immersive sculpture, video and sound piece for Wave Hill Botanical Gardens in New York.']], ['Columbus Circle', ['Columbus Circle, named for Christopher Columbus, is a traffic circle and heavily trafficked intersection in the New York City borough of Manhattan, located at the intersection of Eighth Avenue, Broadway, Central Park South (West 59th Street), and Central Park West, at the southwest corner of Central Park.', ' It is the point from which all official distances from New York City are measured.', ' The name is also used for the neighborhood a few blocks around the circle in each direction.', ' To the south of the circle lies Hell\\'s Kitchen, also known as \"Clinton\", and the Theater District, and to the north is the Upper West Side.']], ['2 Columbus Circle', ['2 Columbus Circle is a 12-story building located on a small, trapezoidal lot on the 
south side of Columbus Circle on the Upper West Side of Manhattan, New York City.', ' Bordered by 58th Street, 59th Street, Broadway, and Eighth Avenue, it stands on the site of the seven-story Grand Circle Hotel designed by William H. Cauvet.', ' Opened in 1964 after A&P heir Huntington Hartford hired architect Edward Durell Stone to build a museum for him at the site.', \" The building came under controversy in 2002 after the Museum of Arts and Design (MAD) was designated as the building's developer.\", ' MAD subsequently significantly altered its design, including modifying its facade; since 1996, ideas had been put forward for the building to be landmarked, so its proposed landmark status was brought into question with this renovation.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 18 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 74%|███████▎ | 368/500 [03:23<06:15, 2.85s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:23.657\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 24 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:23.708\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8789115542994846c1cd9a', 'answer': 'Pollywood', 'question': 'Imran Khan has worked in what type of films refering to the Pashto Language film industry?', 'supporting_facts': [['Imran Khan (Pakistani actor)', 0], ['Pashto cinema', 0]], 'context': [['Khan Abdul Ghani Khan', ['Ghani Khan (Pashto: غني خان) \\u200e (1914–1996) was a Pakistani Pashto language poet, artist, writer, politician and Philosopher of the 20th century.', ' He was a son of Khan Abdul Ghaffar Khan and older brother of Khan Abdul Wali Khan.']], ['Imran Khan (Pakistani actor)', ['Imran Khan (better known as just Imran) is a Pakistani film actor who has worked in Lollywood and Pollywood films.']], ['Imran Khan (Indian actor)', ['Imran Khan (] ; born Imran Pal 13 January 1983) is an American-born film actor, who appears in Hindi films.', ' He is the nephew of actor Aamir Khan and director-producer Mansoor Khan, and the grandson of director-producer Nasir Hussain.', ' He appeared as a child artist in the films \"Qayamat Se Qayamat Tak\" (1988) and \"Jo Jeeta Wohi Sikander\" (1992).']], ['Laaj', ['Laaj (Urdu: \\u200e ) is a 2003 Pakistani Urdu language film which was directed by Rauf Khalid.', ' The film starred Zara Sheikh and Imran Khan in its lead roles.', \" Film's music is composed by Amjad Bobby.\"]], ['Mohammad Imran Pratapgarhi', ['Mohammad Imran Pratapgarhi Urdu: محمّد عمران خان\\u200e Hindi: इमरान प्रतापगढ़ी originally known as Mohammad 
Imran Khan is a famed Urdu language and Hindi language Poet who has gained prominence among the audience through his revolutionary poems.', ' The three times National Award Winner for debate and poetry, he has a firm belief in following his heart.', ' His work has a dominance in framing verses for sharp socio-political distortions, country- love, brotherhood and religious - social harmony fragrance broke.']], ['Cinema of Bangladesh', ['The cinema of Bangladesh is the Bengali language film industry based in Dhaka, Bangladesh.', ' It has often been a significant film industry since the early 1970s and is frequently referred to as \"Dhallywood\" (Bengali: ঢালিউড ), which is a portmanteau of the words Dhaka and Hollywood.', ' The dominant style of Bangladeshi cinema is melodramatic cinema, which developed from 1947 to 1990 and characterizes most films to this day.', ' Cinema was introduced in Bangladesh in 1898 by Bradford Bioscope Company, credited to have arranged the first film release in Bangladesh.', ' Between 1913 and 1914, the first production company named Picture House was opened.', ' A short silent film titled \"Sukumari\" (\"The Good Girl\") was the first produced film in the region during 1928.', ' The first full-length film \"The Last Kiss\", was released in 1931.', ' From the separation of Bangladesh from Pakistan, Dhaka is the center of Bangladeshi film industry, and generated the majority share of revenue, production and audiences. \"', 'The Face and the Mask\", the first Bengali language Bangladeshi full-length feature film was produced in 1956.', ' The 1960s, 1970s, 1980s and the first half of the 1990s were the golden years for Bangladeshi films as the industry produced many successful films.', ' But during then many of the films were unofficial remake of Indian films.']], ['Jaane Tu... Ya Jaane Na', ['Jaane Tu... Ya Jaane Na (translation: \"Whether you know... 
or not\") is a 2008 Indian coming of age romantic drama film, written and directed by Abbas Tyrewala.', \" The film stars Imran Khan and Genelia D'Souza in pivotal roles.\", \" Produced by Mansoor Khan, Aamir Khan, it marks the directional debut of Abbas Tyrewala, the debut of Imran Khan (Aamir Khan's nephew) and Prateik Babbar as actors, and the re-appearance of D'Souza in Hindi cinema.\", ' Released on 4 July 2008, the film received positive reviews, and was successful at the box office.', ' The music is by A. R. Rahman.']], ['57th Filmfare Awards', ['The 57th Filmfare Awards were held on January 29, 2012 at Film City, Mumbai honoring the best film of 2011 from the Hindi-language film industry (commonly known as Bollywood).', ' The ceremony was jointly hosted by Shahrukh Khan and Ranbir Kapoor.', ' Incidentally, both of them have hosted the award ceremonies previously but with different co-hosts (Khan with Saif Ali Khan, Kapoor with Imran Khan), hence making it the first time for this pair to host the show.']], ['Cinema of Pakistan', ['The Cinema of Pakistan or Pakistani cinema (Urdu: \\u200e ) refers to the filmmaking industry in Pakistan.', ' Pakistan is home to several film studios centres, primarily located in its two largest cities - Karachi and Lahore.', ' Pakistani cinema has played an important part in Pakistani culture, and in recent years has begun flourishing again after years of decline, delivering entertainment to audiences in Pakistan and expatriates abroad.', ' Several film industries are based in Pakistan, which tend to be regional and niche in nature.', ' Over 10,000 Urdu feature-films have been produced in Pakistan since 1948, as well as over 8000 Punjabi, 6000 Pashto and 2000 Sindhi feature-length films.', ' The first film ever produced was \"Husn Ka Daku\" in 1930, directed by Abdur Rashid Kardar in Lahore.', ' The first Pakistani-film produced was \"Teri Yaad\", directed by Daud Chand in 1948.', \" Between 1947 and 2007, Pakistani cinema was 
based in Lahore, home to the nation's largest film industry (nicknamed Lollywood).\", ' Pakistani films during this period attracted large audiences and had a strong cult following, was part of the cultural mainstream, widely available and imitated by the masses.', \" During the early 1970s, Pakistan was the world's fourth largest producer of feature films.\", ' However, between 1977 and 2007, the film industry of Pakistan went into decline due to Islamization, strengthening of censorship laws and an overall lack of quality.', ' Throughout the 1980s and 1990s, the film industry went through several periods of ups and downs, a reflection of its dependency on state funding and incentives.', ' By 2000, the film industry in Lahore had collapsed and saw a gradual shift of Pakistani actors, actresses, producers and filmmakers from Lahore to Karachi.', \" By 2007, the wounds of Pakistan's collapsed film industry began to heal and Karachi had cemented itself as the centre of Pakistani cinema.\", ' Quality and new technology led to an explosion of alternative form of Pakistani cinema.', ' The shift has been seen by many as the leading cause for the \"resurgence of Pakistani cinema\".', ' Despite the industry crisis starting in the mid-1980s, Pakistani films have retained much of its distinctive identity.', ' Since the shift to Karachi, Pakistani films have once again began attracting a strong cult following.']], ['Pashto cinema', ['Pashto cinema (Urdu: \\u200e , Pashto: د پښتو سينما\\u200e ), also known by its sobriquet Pollywood (Pashto: پالېوډ\\u200e ), refers to the Pashto language film industry of Pakistani cinema based in Peshawar, Khyber Pakhtunkhwa, Pakistan.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 74%|███████▍ | 370/500 [03:23<04:16, 1.97s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:23.769\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7b1c6b55429931da12c9ca', 'answer': 'John Douglas \"Johnny\" Edwards', 'question': 'Johnny Edwards and Ian Anderson are singers, who had joined the greater number of bands?', 'supporting_facts': [['Johnny Edwards (musician)', 0], ['Ian Anderson', 0]], 'context': [['Belize National Youth Chess Foundation', ['The Belize National Youth Chess Foundation (B.N.Y.C.F.) was co-founded by Ian & Ella Anderson in the summer of 2007 as a not-for-profit organization and with a small army of volunteers it spread throughout the country.', ' The game of Chess has been around for a very long time but in Belize there were no formal organizations and no figures to indicate how many people were playing the game.', ' Building on the founding by Mr. Robert Landolfi and Mr. Glen Reneau of the first school chess club at Hummingird Elementary and the Belize Association of Chess Players in Belize City, due to the efforts of the B.N.Y.C.F. 
there are now teams ranging from the most southern villages in Toledo District to the most northern villages along the Belize-Mexico border in the Corozal District.', ' Since 2007, the organization has more than 50 active chess clubs and over 1400 players around the country and it functions all year round.', ' An interview with Ian Anderson, Co-Chair, reveals that chess is not only a pastime or hobby in Belize; it can and should be used “as an educational tool to help develop the minds of primary school students.”', ' The B.N.Y.C.F. has worked with primary schools to successfully integrate chess as a part of the curriculum of the primary schools in Belize.', ' Within one year the game of Chess became the fastest growing sport in the country.', ' As part of its efforts to promote this sport, the B.N.Y.C.F. assisted the Belize Chess Federation to become active again in 2008 by updating fees due to FIDE, the World Chess affiliate.']], ['Dead to Me', ['Dead to Me is a punk rock band from San Francisco, founded by vocalist/guitarist Jack Dalrymple, drummer Brandon Pollack (both from the band One Man Army) and bassist/vocalist Chicken of Western Addiction.', \" Early on, Pollack was replaced by Chicken's cousin Ian Anderson on drums and Nathan Grice joined as a second guitarist.\"]], ['Mike Vickers', ['Michael \"Mike\" Vickers (born 18 April 1940) is a British musician who came to prominence as guitarist, flautist and saxophonist with the 1960s band, Manfred Mann.', ' He was born in Southampton, Hampshire, England.', ' He originally played flute and saxophone but with the increasing popularity of guitars in bands it was decided that Manfred Mann should have a guitarist in its line-up.', ' Vickers volunteered for this role but he was always happiest playing woodwind.', ' His tough flute soloing on hard blues tracks such as \"Without You\" prefigured the work of Ian Anderson with Jethro Tull five years later.', ' As the group were all multi-instrumentalists who delighted 
in instrumental solos, multi-tracking was used to allow Vickers to perform on guitar and woodwind on the same recordings, while drummer Mike Hugg similarly doubled on vibraphone.']], ['King Kobra III', ['King Kobra III, released in 1988 on New Renaissance Records, was the first and last album by the Edwards, Michael-Phillips, Northrup, Hart and Appice line-up of King Kobra.', ' After the demise of the original line-up, remaining members Carmine Appice and David Michael-Phillips teamed up with Johnny Edwards, Jeff Northrup and Larry Hart, all 3 members of the Sacramento, CA band Northrup at the time.']], ['Gerald Bostock', ['Gerald Bostock is a fictional character originally created by Ian Anderson for his band Jethro Tull\\'s 1972 concept album, \"Thick as a Brick\"; Bostock is credited with writing the lyrics to the album (though Anderson in fact authored them himself).', ' Bostock is also the focus of Anderson\\'s 2012 solo album, \"Thick as a Brick 2: Whatever Happened to Gerald Bostock?', '\", as well as the purported lyricist for Anderson\\'s 2014 solo album \"Homo Erraticus\".']], ['Wild Horses (US rock band)', ['Wild Horses was a band that originally featured former Buster Brown and Montrose members Johnny Edwards and James Kottak.', \" The band also featured James Kottak's former Kingdom Come bandmate Rick Steier.\", ' The band went through at least two bassists: Chris Lester and Jeff Pilson.']], ['Afternoon Records', ['Afternoon Records is a record label based in Minneapolis, Minnesota.', \" The label was founded by Ian Anderson and Michael M. 
Sandstedt in 2003, the year of Ian's graduation from high school.\", ' Ian wanted to create a platform for his high school band \"Aneuretical\", and others.']], ['Johnny Edwards (musician)', ['John Douglas \"Johnny\" Edwards is an American rock singer who sang for the bands Buster Brown, Montrose, King Kobra, Wild Horses, Northrup, Royal Jelly and is best known as the second lead singer of the rock band Foreigner.']], ['Unusual Heat', ['Unusual Heat is the seventh studio album by British-American rock band Foreigner, released on 14 June 1991 by Atlantic Records.', ' Recorded at several different studios across the state of New York and England, and produced by Terry Thomas and Mick Jones, it was the only album with lead singer Johnny Edwards.', ' He replaced original lead singer Lou Gramm after the latter had parted company in 1990.', ' \"Unusual Heat\" was the last album to feature bass guitarist Rick Wills, who joined the band in 1979, and drummer Dennis Elliott, who was a founding member.']], ['Ian Anderson', ['Ian Scott Anderson, MBE (born 10 August 1947) is a Scottish-born musician, singer, songwriter and multi-instrumentalist best known for his work as the lead vocalist, flautist and acoustic guitarist of British rock band Jethro Tull.', ' Anderson plays several other musical instruments, including keyboards, bass guitar, bouzouki, balalaika, saxophone, harmonica, and a variety of whistles. His solo work began with the 1983 album \"Walk into Light\", and since then he released another five works, including the sequel to the Jethro Tull album \"Thick as a Brick\" (1972) in 2012, entitled \"Thick as a Brick 2\".']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.655484272)])']\n", "connector: \n", "Evaluating workflow: 75%|███████▍ | 373/500 [03:23<02:39, 1.26s/it]Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.959888356)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:24:24.301\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8a1dbc55429970aeb7025a', 'answer': 'Sulla', 'question': 'Lex Antonia nullified the dictatorial laws set up by which Roman general?', 'supporting_facts': [['Lex Antonia', 2], ['Lex Antonia', 3], ['Sulla', 0]], 'context': [['Sulla', ['Lucius Cornelius Sulla Felix ( ; c. 
138 BC – 78 BC), known commonly as Sulla, was a Roman general and statesman.', ' He had the distinction of holding the office of consul twice, as well as reviving the dictatorship.', ' Sulla was a skillful general, achieving numerous successes in wars against different opponents, both foreign and Roman.', ' He was awarded a grass crown, the most prestigious Roman military honor, during the Social War.']], ['Lex specialis', ['Lex specialis, in legal theory and practice, is a doctrine relating to the interpretation of laws and can apply in both domestic and international law contexts.', ' The doctrine states that if two laws govern the same factual situation, a law governing a specific subject matter (\"lex specialis\") overrides a law governing only general matters (\"lex generalis\").', ' The situation ordinarily arises with regard to the construction of earlier-enacted specific legislation when more general legislation is later passed.', ' However, then, the doctrine called \"lex posterior derogat legi priori\" may also apply, the younger law overriding the older law.']], ['Iullus Antonius', ['Iullus Antonius (45 BC – 2 BC), also known as Iulus, Julus or Jullus, was a personage in Ancient Rome.', \" He was the second son of Roman general Mark Antony and Antony's third wife Fulvia.\", ' He is best known for being the famous lover of Julia the Elder.', \" He was the full brother of Marcus Antonius Antyllus, half-brother of Clodia Pulchra (the first wife of Augustus) through his mother's first marriage, half-brother of Antonia Major and Antonia Minor through his father's marriage to Octavia Minor, and half-brother of Alexander Helios, Cleopatra Selene\\xa0II and Ptolemy Philadelphus through his father's marriage to Cleopatra\\xa0VII.\", ' His stepsiblings were Marcellus, Claudia Marcella Major (later his wife), Caesarion and Claudia Marcella Minor.', ' He was also stepson to Octavia Minor (sister of Augustus) and Cleopatra\\xa0VII.']], ['Lex Junia Norbana', ['In 
Roman Law, Lex Iunia Norbana of 19 AD classified all freedmen into two classes according to their mode of enfranchisement: enfranchised citizens, (freedmen who enjoyed Roman citizenship) and enfranchised Latini (freedmen who had only Latin rights).', ' Braund, D., Augustus to Nero (Routledge Revivals): A Sourcebook on Roman History, 31 BC-AD 68 (2015), [710] Freedmen would be granted only Latin rights if the manumission of the slave failed to meet any of the conditions set out by the lex Aelia Sextia of 4 AD for it to confer Roman citizenship.', ' This provided that for the freedman to acquire Roman citizenship a slave had to be manumitted at the age of 30 or older, the owner had to have quiritary ownership and the ceremony had to be public.', ' For slaves under the age of thirty, the manumission had to be approved by a special council.', ' The manumission of slaved who had been enslaved because of crimes would raise them only to the position of dedititii (war captives).', ' ^Thus, the Lex Iunia Norbana made the slaves who were not eligible for Roman citizens as per the lex Aelia Sextia enfranchised Latins.', ' The law retained the dedititii.', ' A clause of the law \"took away from these Latini Juniani, as they were called, the capacity of making a testament, taking under a testament, and being appointed tutores by a testament.\"']], ['Joe Laws', ['Joe Roy Laws (June 16, 1911 – August 22, 1979) was an American football player.', ' He played his entire career with the Green Bay Packers, winning three World Championships, and was inducted into the Green Bay Packers Hall of Fame in 1972.', ' Prior to joining the Packers, Laws attended the University of Iowa where he was a member of Sigma Pi fraternity.', ' While at Iowa he was named All-Big Ten quarterback and the Big Ten Most Valuable Player in 1933.', \" On December 17, 1944 Joe Laws set an NFL postseason record (since broken), by intercepting 3 passes in the Packers' 14-7 victory over the Giants in the league 
title game.\"]], ['Lex Irnitana', ['The lex Irnitana is a collection of six bronze tablets containing fragments of Roman municipal laws found in 1981 near El Saucejo, Spain.', ' Together with the \"Lex Salpensana\" and the \"Lex Malacitana\" they provide the most complete version of the \"lex Flavia municipalis\", Flavian municipal law.', ' and have allowed new insights into the workings of Roman law.', ' The tablets are exhibited in the Archeological Museum of Seville.', ' Since the tablets provide the only surviving copy of large parts of the Flavian municipal law, it has provided new insights into the procedural side of municipal courts.']], ['Lex Antonia de Termessibus', ['The Lex Antonia de Termessibus was a Roman law passed in 71 or 68 BC, at the initiative of the tribune Gaius Antonius.']], ['Aghbugha I Jaqeli', ['Aghbugha I Jaqeli (Georgian: აღბუღა I ჯაყელი ) (died 1395) was a Georgian prince (\"mtavari\") and Atabeg of Samtskhe from 1389 to 1395.', ' Aghbugha was a Son of Prince Shalva.', \" After his father's death Aghbugha was appointed as co-ruler (he ruled with his uncle Beka I) of Meskheti by Georgian king Bagrat V.\", ' During 1381-1386 he renewed The book of laws which was established by his Great-great-grandfather, Beka Jaqeli.', ' This book firstly was called \"Aghbugha\\'s law\", then \"Book of laws set by Beka-Aghbugha\".']], ['Lex Antonia', ['Lex Antonia (Latin for \"Antonine law\", sometimes presented plurally as the leges Antoniae, \"Antonine laws\") was a law established in ancient Rome in April 44 BC.', ' It was proposed by Mark Antony and passed by the Roman Senate, following the assassination of Julius Caesar.', ' It formally abolished the Dictatorship.', ' It was the second law to do so (the first being passed after the Second Punic War, replacing the Dictatorship with the final decree of the Senate); however, the earlier law had essentially been nullified by the subsequent Dictatorships of Sulla and Caesar.']], ['Lex Burgundionum', 
['The Lex Burgundionum (Latin for Burgundian Laws, also \"Lex Gundobada\") refers to the law code of the Burgundians, probably issued by king Gundobad.', ' It is influenced by Roman law and deals with domestic laws concerning marriage and inheritance as well as regulating weregild and other penalties.', ' Interaction between Burgundians is treated separately from interaction between Burgundians and Gallo-Romans.', \" The oldest of the 14 surviving manuscripts of the text dates to the 9th century, but the code's institution is ascribed to king Gundobad (died 516), with a possible revision by his successor Sigismund (died 523).\", ' The \"Lex Romana Burgundionum\" is a separate code, containing various laws taken from Roman sources, probably intended to apply to the Burgundians\\' Gallo-Roman subjects.', ' The oldest copy of this text dates to the 7th century.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708064.65746456)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.589522608)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.436683124)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.93616856)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.719492847)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.144719047)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.38446186)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.256820524)])']\n", "connector: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:24.330\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7a33205542996a35c1712f', 'answer': 'Province of New York', 'question': 'Out of two American colonies that had a series of skirmishes and raids between 1701 and 1765 at the disputed border, which British proprietary colony became a royal colony on the northeast coast of North America?', 'supporting_facts': [['New York – New Jersey Line War', 0], ['Province of New York', 0]], 
'context': [['History of the New Jersey State Constitution', ['Originally, the state of New Jersey was a single British colony, the Province of New Jersey.', ' After the English Civil War, Charles II assigned New Jersey as a proprietary colony to be held jointly by Sir George Carteret and John Berkeley, 1st Baron Berkeley of Stratton.', ' Eventually, the collection of land fees, or quit-rents, from colonists proved inadequate for colonial profitability.', ' Sir George Carteret sold his share of the colony to the Quakers in 1673.', ' Following the sale, the land was divided into East and West Jersey.', ' In 1681, West Jersey adopted a constitution.', ' In 1683, East Jersey adopted one as well.', ' In 1702, the colonies were united again under Anne, Queen of Great Britain, and adopted a constitution in 1776.']], ['Province of Pennsylvania', ['The Province of Pennsylvania, also known as the Pennsylvania Colony, was founded in English North America by William Penn on March 4, 1681 as dictated in a royal charter granted by King Charles II.', ' The name Pennsylvania, which translates roughly as \"Penn\\'s Woods\", was created by combining the Penn surname (in honor of William\\'s father, Admiral Sir William Penn) with the Latin word \"sylvania\", meaning \"forest land.\"', ' The Province of Pennsylvania was one of the two major Restoration colonies, the other being the Province of Carolina.', \" The proprietary colony's charter remained in the hands of the Penn family until the American Revolution, when the Commonwealth of Pennsylvania was created and became one of the original thirteen states.\"]], ['Province of New York', ['The Province of New York (1664–1776) was a British proprietary colony and later royal colony on the northeast coast of North America.', ' As one of the Thirteen Colonies, New York achieved independence and worked with the others to found the United States.']], ['Charter colony', ['Charter colony is one of three classes of colonial government 
established in the 17th century English colonies in North America, the other classes being proprietary colony and royal colony.', ' The colonies of Rhode Island, Connecticut, and Massachusetts Bay were charter colonies.', ' In a charter colony, Britain granted a charter to the colonial government establishing the rules under which the colony was to be governed.', ' The charters of Rhode Island and Connecticut granted the colonists significantly more political liberty than other colonies.', ' Rhode Island and Connecticut continued to use their colonial charters as their State constitutions after the American Revolution.']], ['Canada under British rule', ['Canada first came under British rule with the Treaty of Paris (1763) which ceded New France, of which Canada was a part, to the British Empire.', ' Gradually, other territories, colonies, and provinces that were part of British North America would be added to Canada.', ' The Royal Proclamation of 1763 enlarged the colony of Canada under the name of the Province of Quebec, which with the Constitutional Act 1791 became known as The Canadas.', ' With the Act of Union 1840 Upper and Lower Canada were joined to become the United Province of Canada.', ' Later, with Confederation in 1867, the British maritime colonies of New Brunswick and Nova Scotia were joined with the British colony of Canada to form the Dominion of Canada, which was subsequently divided into four provinces, Ontario, Quebec, New Brunswick, and Nova Scotia.', \" A number of other British colonies, such as Newfoundland and British Columbia, and large territories such as Rupert's Land initially remained outside of the newly formed federation.\", ' Over time, the remaining colonies and territories within British North America came under the control of Canada until the current geographic extent of the country was reached when Newfoundland and Labrador joined Canada in 1949.', ' Although confederation in 1867 led to an enlarged Dominion with increased 
autonomy over domestic affairs, Canada still remained a colony within the British Empire and was thus subordinate to the British Parliament until the enactment of the Statute of Westminster in 1931.', ' This statute recognized Canada as an independent peer coequal with the United Kingdom, and thus provided the Parliament of Canada with legislative sovereignty over all federal matters except the power to change the constitutional laws of Canada which remained under the purview of the Parliament of the United Kingdom.', \" Canada's final vestige of legal dependence on the United Kingdom was terminated in 1982 with the enactment of the Canada Act, subsequently providing Canada with full legal sovereignty completely independent of the United Kingdom.\"]], ['Stamp Act Congress', ['The Stamp Act Congress or First Congress of the American Colonies was a meeting held between October 7 and 25, 1765 in New York City, consisting of representatives from some of the British colonies in North America; it was the first gathering of elected representatives from several of the American colonies to devise a unified protest against new British taxation.', ' Parliament had passed the Stamp Act, which required the use of specially stamped paper for legal documents, playing cards, calendars, newspapers and dice for virtually all business in the colonies, and was going into effect on November 1.']], ['Province of New Jersey', ['The Province of New Jersey was one of the Middle Colonies of Colonial America and became the U.S. 
state of New Jersey in 1776.', ' The province had originally been settled by Europeans as part of New Netherland, but came under English rule after the surrender of Fort Amsterdam in 1664, becoming a proprietary colony.', ' The English then renamed the province after the Isle of Jersey in the English Channel.', ' The Dutch Republic reasserted control for a brief period in 1673–1674.', ' After that it consisted of two political divisions, East Jersey and West Jersey, until they were united as a royal colony in 1702.', ' The original boundaries of the province were slightly larger than the current state, extending into a part of the present state of New York, until the border was finalized in 1773.']], ['New York – New Jersey Line War', ['The New York – New Jersey Line War (also known as the N.J. Line War) refers to a series of skirmishes and raids that took place for over half a century between 1701 and 1765 at the disputed border between two American colonies, the Province of New York and the Province of New Jersey.']], ['Proprietary colony', ['A proprietary colony was a type of British colony mostly in North America and the Caribbean in the 17th century.', ' In the British Empire, all land belonged to the ruler, and it was his prerogative to divide.', ' Therefore, all colonial properties were partitioned by royal charter into one of four types: proprietary, royal, joint stock, or covenant.', ' King Charles II used the proprietary solution to reward allies and focus his own attention on Britain itself.', ' He offered his friends colonial charters which facilitated private investment and colonial self-government.', ' The charters made the proprietor the effective ruler, albeit one ultimately responsible to English law and the king.', ' Charles II gave New Netherland to his younger brother The Duke of York, who named it New York.', ' He gave an area to William Penn who named it Pennsylvania.']], ['Stamp Act 1765', ['The Stamp Act of 1765 (short title \"Duties in 
American Colonies Act 1765\"; 5 George III, c. 12) was an Act of the Parliament of Great Britain that imposed a direct tax on the colonies of British America and required that many printed materials in the colonies be produced on stamped paper produced in London, carrying an embossed revenue stamp.', ' Printed materials included legal documents, magazines, playing cards, newspapers, and many other types of paper used throughout the colonies.', ' Like previous taxes, the stamp tax had to be paid in valid British currency, not in colonial paper money.', \" The purpose of the tax was to help pay for troops stationed in North America after the British victory in the Seven Years' War and its North American theater of the French and Indian War.\", ' The Americans said that there was no military need for the soldiers because there were no foreign enemies on the continent, and the Americans had always protected themselves against Indians.', ' They suggested that it was actually a matter of British patronage to surplus British officers and career soldiers who should be paid by London.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708065.23281698)])']\n", "connector: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:24.332\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae4eb9255429908b63264ba', 'answer': 'Mohawk, Onondaga, Oneida, Cayuga, Seneca, and Tuscarora', 'question': 'What were the nations after 1722 that composed a culture that was extensively studied by William N. Fenton?', 'supporting_facts': [['William N. Fenton', 0], ['Iroquois', 1]], 'context': [['Bimal N. Patel', ['Bimal N. Patel is a Professor of Public International Law and the current Director of the Gujarat National Law University, Gandhinagar.', ' He was appointed by a High Level Committee headed by the then Chief Justice of India, K G Balakrishnan, at the Supreme Court of India premises.', \" The Government of India has also recently appointed him as a member of the 21st Law Commission of India along with Justice Balbir Singh Chauhan, retired Judge of the Hon'ble Supreme Court of India as its chairperson.\", ' Prof. 
Patel is a former International civil servant, scholar and academician of international law and diplomacy.An acclaimed international law jurist, he has extensively studied, researched, commented and published works on the administrative, procedural and substantive jurisprudence of the International Court of Justice (ICJ), International Tribunal for the Law of the Sea (ITLOS), International Criminal Tribunal for former Yugoslavia (ICTY) and International Labour Organisation Administrative Tribunal (Geneva).', ' His publications on India and International Law and Responsibility of International Organisations are reviewed and referred by international law scholars and journals across the world.', ' He has published, edited several books, research papers/articles/surveys in leading academic and international law journals.', ' He has been involved in drafting several national and state primary and secondary legislations, regulations, rules and holds the distinction as one of the first Indians to serve at the International Labour Organization Administrative Tribunal (Geneva).', ' He has delivered numerous lectures, including one at Cambridge University, UK, and has received several honours.', ' He has served at the Organisation for the Prohibition of Chemical Weapons,Hague, Netherlands.']], ['Ashanti Empire', ['The Ashanti (also spelled Asante) Empire (1701–1957) was an Akan empire and kingdom in what is now modern-day Ghana.', ' The Ashanti Empire expanded from Ashanti to include the Brong-Ahafo, Central region, Eastern region, Greater Accra region, and Western region, of present-day Ghana.', ' The Ashanti benefited from early firearm adoption.', ' Combined with effective strategy, they fashioned an empire that stretched from central Ghana to the present-day Ivory Coast.', \" Due to the empire's military prowess, wealth, architecture, sophisticated hierarchy and culture, Ashanti has been extensively studied and has more historiographies by European, primarily British, 
authors than almost any other indigenous culture of Sub-Saharan Africa.\"]], ['William N. Fenton', ['William N. Fenton (December 15, 1908 – June 17, 2005) was an American scholar and writer known for his extensive studies of Iroquois history and culture.', ' He started his studies of the Iroquois in the 1930s and published a number of significant works over the following decades.', ' His final work was published in 2002.', ' During his career, Fenton was director of the New York State Museum and a professor of anthropology at the State University of New York.']], ['William N. Rhodes', ['William N. Rhodes was an American airforce Technical Sergeant in World War II.', ' On March 31, 1945, TSgt.', \" William N. Rhodes' aircraft was engaged in a mission to take out a primary target (oil refinery) at Ziet, Germany.\", ' During that engagement his B-17 aircraft was hit by enemy fire.', ' The number three engine oil supply line was cut by flak, and the landing gear was hit and jammed.', ' Flak also damaged an engine housing causing that prop to be shut down and feathered.', ' The Aircraft was able to maintain an altitude of 17,500 feet and began its journey back to England when two jet propelled German fighters attacked.', ' These two German aircraft were sighted and immediately reported to the pilot by TSgt.', ' Rhodes.', \" The B-17 was hit during the German fighter attack inflicting extensive damage to the aircraft's number three fuel tank, causing it to explode and tossed the right wing violently.\", ' Following this hit the aircraft went into a tight downward spin, within just a few thousand feet the tail section of the aircraft blew off causing the aircraft to level off slightly and continue falling in a shallow spin.', ' This presented the opportunity for the Navigator, Turret Gunner, Co-Pilot and TSgt.', ' Rhodes to bail out.', ' TSgt.', ' Rhodes and three other crew members landed near Biberach, Germany.', ' Of the nine original crew members on the B-17, only 
four survived the aerial encounter.', ' Upon landing, TSgt.', ' Rhodes and the other survivor’s were captured by German troops waiting on the ground, searched and taken to a Luftwaffe camp where they were processed as Prisoners of war.']], ['Iroquois', ['The Iroquois ( or ) or Haudenosaunee ( ) are a historically powerful northeast Native American confederacy.', ' They were known during the colonial years to the French as the \"Iroquois League,\" and later as the \"Iroquois Confederacy,\" and to the English as the \"Five Nations\" (before 1722), and later as the \"Six Nations,\" comprising the Mohawk, Onondaga, Oneida, Cayuga, Seneca, and Tuscarora peoples.']], ['Near polygon', ['In mathematics, a near polygon is an incidence geometry introduced by Ernest E. Shult and Arthur Yanushka in 1980.', ' Shult and Yanushka showed the connection between the so-called tetrahedrally closed line-systems in Euclidean spaces and a class of point-line geometries which they called near polygons.', ' These structures generalise the notion of generalized polygon as every generalized 2\"n\"-gon is a near 2\"n\"-gon of a particular kind.', ' Near polygons were extensively studied and connection between them and dual polar spaces was shown in 1980s and early 1990s.', ' Some sporadic simple groups, for example the Hall-Janko group and the Mathieu groups, act as automorphism groups of near polygons.']], ['Candicine', ['Candicine is a naturally occurring organic compound that is a quaternary ammonium salt with a phenethylamine skeleton.', ' It is the N,N,N-trimethyl derivative of the well-known biogenic amine tyramine, and, being a natural product with a positively charged nitrogen atom in its molecular structure, it is classed as an alkaloid.', ' Although it is found in a variety of plants, including barley, its properties have not been extensively studied with modern techniques.', ' Candicine is toxic after parenteral administration, producing symptoms of neuromuscular blockade; further 
details are given in the \"Pharmacology\" section below.']], ['Kunda culture', ['Kunda Culture, originating from the Swiderian culture, comprised mesolithic hunter-gatherer communities of the Baltic forest zone extending eastwards through Latvia into northern Russia, dating to the period 8500–5000 BC according calibrated radiocarbon dating.', ' It is named after the Estonian town of Kunda, about 110 km east of Tallinn along the Gulf of Finland, near where the first extensively studied settlement was discovered on Lammasmäe Hill and in the surrounding peat bog.', ' The oldest known Kunda culture settlement in Estonia is Pulli.', ' The Kunda Culture was succeeded by the Narva culture, who used pottery and showed some traces of food production.']], ['Oliver Phase', ['The Oliver Phase is the name for a Late Woodland Native American culture that flourished from 1200 and 1450 CE along the east and west forks of the White River in central and southern Indiana.', ' The Oliver Phase is of the Western Basin Tradition which includes the Springwells Phase, the Younge Phase, and the Riviere au Vase Phase.', ' Oliver people were village dwelling farmers with a heavy reliance on maize, very similar to other Late Woodland peoples in the area the Oneota, Fort Ancient, and Monongahela cultures.', ' The name was originally coined by archaeologist James B. Griffin in 1946 to describe a Late Woodland ceramic complex centered in Hamilton and Marion counties in the valley of the West Fork of the White River first extensively studied at the Bowen site.']], ['Nematostella', ['Nematostella is a genus of sea anemones in the family Edwardsiidae.', ' There are three species.', ' The best known is the starlet sea anemone (\"N. 
vectensis\"), which has been extensively studied as a model organism in fields such as genetics, evolution, and ecology.', ' The defining morphological apomorphy of \"Nematostella\" is the presence of nematosomes.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.046465144)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.94805052)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.279565085)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.16420712)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708134.735819455)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.913987298)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.375286708)])']\n", "connector: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9708134.91655394)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.201073507)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.176103158)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708134.827366596)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.956761103)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708134.998871293)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.274414623)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.680830916)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708134.630895024)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.185286311)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:24.333\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a77e70f5542992a6e59dfeb', 'answer': 'The Tempest', 'question': 'What is the title of the 1979 film adaptation of William Shakespeare\\'s play in which the English poet, actor, political activist and dramatist who wrote wrote a number of book-length 
polemical poems such as \"Autogeddon\", \"Falling for a Dolphin\" and \"Whale Nation\" played a main character?', 'supporting_facts': [['The Tempest (1979 film)', 0], ['The Tempest (1979 film)', 1], ['Heathcote Williams', 0], ['Heathcote Williams', 1]], 'context': [['The Tempest (1979 film)', [\"The Tempest is a 1979 film adaptation of William Shakespeare's play of the same name.\", ' Directed by Derek Jarman, with Heathcote Williams as Prospero, it also stars Toyah Willcox, Jack Birkett and Helen Wellington-Lloyd from Jarman\\'s previous feature, \"Jubilee\" (1977), as well as his long-time cohort Karl Johnson.']], ['William Shakespeare', [\"William Shakespeare ( ; 26 April 1564 (baptised)\\xa0– 23 April 1616) was an English poet, playwright, and actor, widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist.\", ' He is often called England\\'s national poet, and the \"Bard of Avon\".', ' His extant works, including collaborations, consist of approximately 38 plays, 154 sonnets, two long narrative poems, and a few other verses, some of uncertain authorship.', ' His plays have been translated into every major living language and are performed more often than those of any other playwright.']], ['Henry Carey (writer)', ['Henry Carey (c. 
26 August 1687 – 5 October 1743) was an English poet, dramatist and song-writer.', ' He is remembered as an anti-Walpolean satirist and also as a patriot.', ' Several of his melodies continue to be sung today, and he was widely praised in the generation after his death.', ' Because he worked in anonymity, selling his own compositions to others to pass off as their own, contemporary scholarship can only be certain of some of his poetry, and a great deal of the music he composed was written for theatrical incidental music.', ' However, under his own name and hand, he was a prolific song writer and balladeer, and he wrote the lyrics for almost all of these songs.', ' Further, he wrote numerous operas and plays.', ' His life is illustrative of the professional author in the early 18th century.', ' Without inheritance or title or governmental position, he wrote for all of the remunerative venues, and yet he also kept his own political point of view and was able to score significant points against the ministry of the day.', ' Further, he was one of the leading lights of the new \"Patriotic\" movement in drama.']], ['Holy Sonnets', ['The Holy Sonnets—also known as the Divine Meditations or Divine Sonnets—are a series of nineteen poems by the English poet John Donne (1572–1631).', \" The sonnets were first published in 1633—two years after Donne's death.\", ' The poems are sonnets and are predominantly in the style and form prescribed by Renaissance Italian poet Petrarch (or Francesco Petrarca) (1304–1374) in which the sonnet consisted of two quatrains (four-line stanzas) and a sestet (a six-line stanza).', ' However, several rhythmic and structural patterns as well as the inclusion of couplets are elements influenced by the sonnet form developed by English poet and playwright William Shakespeare (1564–1616).']], ['Samuel Taylor Coleridge', ['Samuel Taylor Coleridge ( ; 21 October 177225 July 1834) was an English poet, literary critic, philosopher and theologian who, with 
his friend William Wordsworth, was a founder of the Romantic Movement in England and a member of the Lake Poets.', ' He wrote the poems \"The Rime of the Ancient Mariner\" and \"Kubla Khan\", as well as the major prose work \"Biographia Literaria\".', ' His critical work, especially on William Shakespeare, was highly influential, and he helped introduce German idealist philosophy to English-speaking culture.', ' Coleridge coined many familiar words and phrases, including suspension of disbelief.', ' He was a major influence on Ralph Waldo Emerson and American transcendentalism.']], ['Isabella Cervoni', [\"Isabella Cervoni (Colle Val d'Elsa, 1575–1600) was an Italian poet of the Counter-Reformation period, active between 1590 and 1600.\", \" She wrote encomiastic and polemical poems addressed to numerous secular and religious dignitaries of the Italian Renaissance, including Pope Clement VIII, Maria de' Medici, Christina of Lorraine and Henry IV of France.\", ' She was praised for her talent and ambition by Cristoforo Bronzini in his 1625 dialogue \"Della dignità delle donne, dialogo…settimana prima e giornata quarta\" as having \"given the world many beautiful and spiritual compositions\" despite her \"most tender age.\"']], ['Shakespeare bibliography', ['William Shakespeare (1564–1616) was an English poet and playwright.', ' He wrote approximately 38 plays and 154 sonnets, as well as a variety of other poems.']], ['Charles Goodall (poet)', ['Charles Goodall (1671—May 11, 1689) is a minor English poet.', ' A student of Eton College and then Merton College, Oxford, he wrote a number of romantic and erotic poems referring to male students at said colleges.', ' In 1689, the year of his death, he put together a collection entitled \"Poems and Translations\" which contains 33 poems with male-male subject matter, eleven regarding women, and 13 to a mistress named \\'Idera\\' (considered probably imaginary).', ' A number of the homoerotic poems have been rewritten to 
remove the same-sex subject matter.']], ['Sonnet 154', ['As the last in the famed collection of sonnets written by English poet and playwright William Shakespeare from 1592 to 1598, Sonnet 154 is most often thought of in a pair with the previous sonnet, number 153.', ' As A. L. Rowse states in \"Shakespeare\\'s Sonnets: The Problems Solved\", Sonnets 153 and 154 \"are not unsuitably placed as a kind of coda to the Dark Lady Sonnets, to which they relate.\"', ' Rowse calls attention to the fact that Sonnets 153 and 154 \"serve quite well to round off the affair Shakespeare had with Emilia, the woman characterized as the Dark Lady, and the section of the Dark Lady sonnets\".', ' Shakespeare used Greek mythology to address love and despair in relationships.', ' The material in Sonnets 153 and 154 has been shown to relate to the six-line epigram by the Byzantine poet known as Marianus Scholasticus, who published a collection of 3,500 poems called \"The Greek Anthology\".', \" When translated, the epigram resembles Sonnets 153 and 154, addressing love and the story of Cupid, the torch, and the Nymph's attempt to extinguish the torch.\"]], ['Heathcote Williams', ['John Henley Heathcote-Williams (15 November 1941 – 1 July 2017), known as Heathcote Williams, was an English poet, actor, political activist and dramatist.', ' He wrote a number of book-length polemical poems including \"Autogeddon\", \"Falling for a Dolphin\" and \"Whale Nation\", which in 1988 became, according to Philip Hoare, \"the most powerful argument for the newly instigated worldwide ban on whaling.\"', ' Williams invented his idiosyncratic \"documentary/investigative poetry\" style which he put to good purpose bringing a diverse range of environmental and political matters to public attention.', ' His last published work, \"American Porn\" was a critique of the American political establishment and the election of President Donald Trump: Publication date was the date of Trump\\'s inauguration (20 
January 2017).', ' In June 2015, he published a book-length investigative poem about the \"Muslim Gandhi\", Khan Abdul Ghaffar Khan, \"Badshah Khan\".']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708134.965267474)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:24.333\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ade9545554299728e26c741', 'answer': 'Swiss made', 'question': 'What is special about the wristwatches that Favre-Leuba manufactures?', 'supporting_facts': [['Favre-Leuba', 0], ['Swiss made', 0]], 'context': [['Muramatsu Flutes', ['The Muramatsu company is a Japanese company that manufactures flutes.', ' Their handmade flutes are made from sterling silver, 9K, 14K, 18K, and 24K gold, as well as platinum.', ' The 18K, 24K, and platinum flutes may be purchased by special order only.']], ['Kimber Manufacturing', ['Kimber Manufacturing is an American company that designs, manufactures, and distributes small arms such as M1911 pistols, Solo pistols and rifles.', ' The USA Shooting Team, Marines assigned to Special Operations Command, and the LAPD SWAT team use Kimber pistols.']], 
['Backlight', ['A backlight is a form of illumination used in liquid crystal displays (LCDs).', ' As LCDs do not produce light by themselves (unlike, for example cathode ray tube (CRT) displays), they need illumination (ambient light or a special light source) to produce a visible image.', ' Backlights illuminate the LCD from the side or back of the display panel, unlike frontlights, which are placed in front of the LCD.', ' Backlights are used in small displays to increase readability in low light conditions such as in wristwatches, and are used in smart phones, computer displays and LCD televisions to produce light in a manner similar to a CRT display.', ' A review of some early backlighting schemes for LCDs is given in a report \"Engineering and Technology History\" by Peter J. Wild.']], ['Favre-Leuba', ['Favre-Leuba is a Swiss manufacturer of wristwatches headquartered in Solothurn, Switzerland.', ' It was a pioneer in watch design, manufacturing and distribution, thus contributing immensely to the Swiss watchmaking industry.', ' The foundation of the brand was laid in 1737 when Abraham Favre was registered as a watchmaker, so it has been reported as the second-oldest watch brand in Switzerland.']], ['Watts Brothers Tool Works', ['Watts Brothers Tool Works is a tool manufacturer located in Wilmerding, Pennsylvania.', ' They are known for manufacturing drill bits that can drill square holes, including blind holes which cannot be made with other methods such as broaching.', ' The Harry Watts square drill bit is based on a Reuleaux triangle shape, and is used together with a guide and a special chuck to make a square hole.', ' Similarly, the company also manufactures drill bits for other angular holes such as pentagons and hexagons.']], ['Ariella Fashion House', ['Ariella is a British fashion brand of cocktail, evening and special occasion wear founded in 1966.', ' Ariella designs, manufactures, wholesales and retails women’s fashion.', ' Ariella sells under their 
own labels - retail label Ariella London and designer label Ariella Couture, as well as under clients’ labels.', ' In April 2015 Ariella opened its flagship store in Brent Cross Shopping Centre.']], ['Lavet type stepping motor', ['The Lavet type stepping motor has widespread use as a drive in electro-mechanical clocks and is a special kind of single-phase stepping motor.', ' Both analog and stepped-movement quartz clocks use the Lavet type stepping motor.', ' See Quartz clock.', ' Through miniaturization it can be used in wristwatches and requires very little power, making a battery last for many years.', ' The French engineer Marius Lavet is known as the inventor for this kind of drives and described it in 1936 in his patent application FR823395.']], ['Triangle Group', ['Triangle Group (also known as Triangle Tyre) is a Chinese tire company that manufactures a range of tires for vehicles from passenger cars to construction equipment and tires fit for special purposes.', ' As of 2015 it is the 14th largest tire maker in the world according to Tyres & Accessories.']], ['Carpenter Technology Corporation', ['Carpenter Technology Corporation develops, manufactures and distributes cast/wrought and powder metal stainless steels and special alloys including high temperature (iron-nickel-cobalt base), stainless, superior corrosion resistant, controlled expansion alloys, ultra-high strength and implantable alloys, tool and die steels and other specialty metals, as well as cast/wrought titanium alloys.', ' It also manufactures and rents down-hole drilling tools and components used in the oil and gas industry.']], ['Swiss made', ['Swiss made is a label used to indicate that a product was made in Switzerland.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 
pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 18 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708136.245096168)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.377111712)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.013230087)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.584151508)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708138.526934477)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708139.22555916)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.96421166)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.823915036)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.516709032)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.463572929)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.578634253)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.978094896)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.422558844)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.604594536)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.925914455)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.907010758)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.401478088)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.974189317)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.634267274)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.02056764)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.168021532)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.705950284)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.080380108)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.168498788)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.056241084)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708215.942308607)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.37195532)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.853211123)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708217.422062252)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.756755512)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.292566344)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.409672638)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.284062931)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.397638408)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.16252784)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.179416783)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.30292197)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.586913869)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.352674136)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.195824277)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.056452032)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.365613084)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708262.969542604)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708264.384750113)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.600556282)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.591632992)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.679261388)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.303079264)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708272.605474371)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.614556631)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.92685898)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.034968589)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.641266083)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.080722423)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.541462876)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.021738004)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.087973123)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.473526964)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708275.187182816)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708275.02251693)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708278.059565328)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.950053493)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.339741312)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.781682413)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.775700564)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708275.498703344)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708275.18232014)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708275.247100972)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708276.001122504)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.54817848)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.282089995)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.752982434)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.775112096)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.689148664)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.1191677)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.132532064)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.780472483)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708342.580639588)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708339.340332417)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.005039906)])']\n", "connector: 
\n", "Unclosed connector\n", "connections: ['deque([(, 9708135.253620695)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.817923194)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.553171491)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.492793929)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.91678925)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.075321572)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708213.895518731)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708267.507687632)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708272.680728773)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.088846317)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708263.13733978)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708273.572627148)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.751047174)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.682396272)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9708338.653530233)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.895624803)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.576361524)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.651520414)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.782416752)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.6546728)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.5589491)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708338.61469092)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.516720934)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.544632751)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.555933565)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.593465712)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.759091664)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.951871954)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708062.581176613)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.194963586)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.551260425)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.346086776)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.918746129)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.90272881)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.090365944)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.001979176)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.089018732)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.693992512)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708064.07620736)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708063.911938624)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708135.95180649)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.250795566)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.545636026)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.346252436)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.674650656)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.844026435)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.834426146)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708136.286985291)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708137.095409906)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.695771849)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.747223727)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.902872326)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708139.083961908)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.448846253)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.761907453)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.7443813)])']\n", "connector: \n", "Unclosed connector\n", "connections: 
['deque([(, 9708137.578492912)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708137.465930896)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708139.168165812)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708138.52435722)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708145.633058311)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708138.224220578)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708138.843101729)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708138.040725572)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.382226463)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.76349435)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.721026031)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.94658769)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.666621715)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708214.623832315)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.12018294)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.3308074)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.172960091)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708215.991489245)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.020780593)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708218.222785903)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.531855462)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9708215.23779304)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708218.105839292)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.554272741)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708216.242673095)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708262.989595007)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.41027286)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708272.646169083)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708272.687070696)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.701380083)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.000637252)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.834522665)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708274.160815671)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708273.714882769)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.96023706)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.198098972)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.75779233)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.902229909)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.493185269)])']\n", "connector: 
\n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.494395014)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.078612028)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.309723739)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708339.86867332)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.234079435)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.746586328)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.45671624)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.844096545)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708340.617053391)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.452207332)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.4970748)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.217021016)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.412359752)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9708341.25604762)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.345981557)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.131781358)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.589980025)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.322915982)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.41115284)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708342.107567651)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.986635108)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708343.323386673)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708342.275655456)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708343.285185596)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708341.951250924)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.427294953)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.363457063)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.42610478)])']\n", 
"connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.607600609)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.684009502)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.729440488)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.191401375)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.57208648)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.82304765)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.816396425)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.48128496)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.154752059)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.545418872)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.426129032)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.282229831)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.254722519)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.945091177)])']\n", "connector: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9708407.533742605)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.335709995)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708407.986593705)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.487061914)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.576427707)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.795004727)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.671234028)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.5324596)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.54596284)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.934395865)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.652039126)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.4952771)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708408.98887621)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.27912867)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed 
connector\n", "connections: ['deque([(, 9708416.93307546)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.383746693)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.36994919)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.178555284)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.572280606)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708411.573964955)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.09534704)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.956628544)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.795180496)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.937666751)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.954870995)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.26099884)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.979822189)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708409.972871115)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.113043124)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.148090765)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708410.22252242)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708411.150154954)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708411.433670608)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.22661486)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708486.965622276)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.86187345)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.054417543)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.020803455)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.812288076)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.446396954)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.1536056)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.167295534)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.359632907)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.24073456)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708486.9678677)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.822154475)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708486.926560951)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.485289283)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.653128313)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.715366956)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708487.296186447)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.085359056)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708486.990764327)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.24568794)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.376997432)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.24995544)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.647437308)])']\n", 
"connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.494412107)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.282747116)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.34962272)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.827357816)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708488.642324258)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.56664528)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.07083092)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.304862896)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.478387516)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.116945228)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.731330588)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708490.474096475)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708490.64371221)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708490.456382217)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.452500883)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.805391427)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.553504689)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.919569584)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.63221626)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708491.257992184)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708490.422914164)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.785650138)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.250118027)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.859048093)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.92076397)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708489.73869216)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708490.219694808)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 
9708491.399517065)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.518022655)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.373522796)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.22240743)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.735564424)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.729866767)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.783590784)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.640832378)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.234043077)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.64393464)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.391408823)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.709921272)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.787894152)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 
9708492.87488484)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.770893106)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708493.583997045)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.96674268)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708492.934044804)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.659059448)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.839997219)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.571782004)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.644806804)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.133528853)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.655442577)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.2307393)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708502.684112212)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.889757462)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.56328754)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.38869498)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708514.494067853)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708514.516156629)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.339177808)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708503.343244936)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708514.633015823)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.372180045)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708519.868877642)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.961128874)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708532.016717454)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.67995342)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.769276263)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708514.78878152)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.98817952)])']\n", "connector: \n" ] 
}, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708498.702706683)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708514.803442026)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708532.632026263)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.507377444)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.865447232)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.782345835)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708526.473402644)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.70393088)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.77122586)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708519.938558176)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.666984096)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708526.467871832)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708525.832152836)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9708538.04775375)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.961853497)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.613800768)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708538.757834287)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708543.990689231)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708543.902618868)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708543.893284405)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708531.90166506)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708538.01691692)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.926339546)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708537.60976566)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708543.804588228)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708543.827887258)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708544.152854852)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9708546.27979118)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708546.170427492)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.11609898)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.09252895)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.077948367)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708544.10829593)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.34128573)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708544.024825515)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708544.096849713)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708546.35512711)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.301138552)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.452677052)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.504129993)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.196700236)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.932031864)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.444427095)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.094124973)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.217960227)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.060913453)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.592714667)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.86224205)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.073293528)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.662335709)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708550.392198896)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.610699136)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.429301666)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.329977833)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", 
"Unclosed connector\n", "connections: ['deque([(, 9708548.846621148)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708548.606762236)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708551.613018932)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.759715175)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708547.795792244)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708548.59426247)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.438950382)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708550.558260027)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.276035285)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.779026188)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.203526028)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.512079818)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.916522106)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.448933916)])']\n", "connector: \n", "Unclosed 
client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.691201756)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.65210619)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.99194687)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708549.797186052)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.67389962)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.176850077)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.868151205)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708551.021425435)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.017568864)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.360867571)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708550.804919498)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.977635467)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708551.502610376)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.076753367)])']\n", 
"connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.281473884)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708550.515930224)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.295732988)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.474900443)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.059591139)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708550.39832102)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708559.46264108)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.75735621)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.114672324)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.037146661)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.833953928)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.55370514)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.453907242)])']\n", "connector: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.91949385)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708558.85617466)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.934007471)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708573.974211678)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708552.862205446)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708553.0512766)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708558.63579207)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708559.526754716)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708574.275152927)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708559.1960854)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708559.3900955)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708574.597620444)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708593.151578618)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708593.132229727)])']\n", "connector: \n", 
"Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708559.10527713)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708576.147172509)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708574.235044774)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708559.074580455)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708574.582709137)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708563.68094108)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708574.245380148)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708574.043222196)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708574.054909872)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.518275002)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708592.501186786)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708591.640639763)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: 
\n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708592.8821973)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.105183795)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708591.464844728)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708593.229978256)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708592.582501456)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.21859933)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708597.793934228)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.175176743)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.383513728)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708597.714933608)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708604.759374635)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.165556785)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.35191197)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.689685805)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708604.584917925)])']\n", "connector: \n", "Unclosed client 
session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708598.342616696)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708604.257812222)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708606.989723217)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708604.712227236)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed connector\n", "connections: ['deque([(, 9708607.607494863)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708604.35715965)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708608.19756376)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708607.512948412)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9708604.255771782)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708607.781416668)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708608.842734292)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9708607.419581376)])']\n", "connector: \n", "Evaluating workflow: 76%|███████▌ | 380/500 [03:40<03:49, 1.91s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 
12:24:41.793\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7d1d825542995ed0d165f5', 'answer': 'Lorman', 'question': 'In what city did Charlie Spiller play college football?', 'supporting_facts': [['Charlie Spiller', 2], ['Alcorn State University', 0]], 'context': [['1891 Purdue football team', ['The 1891 Purdue football team was an American football team that represented Purdue University during the 1891 college football season.', \" The team compiled a 4–0 record in the university's fourth season fielding an intercollegiate football team.\", ' For the 1891 season, Purdue hired Knowlton Ames as its football coach.', ' Ames played for Princeton from 1886 to 1889 and was considered one of the greatest players ever to play college football, after scoring 730 points for Princeton.', ' The 1891 Purdue team shut out all four opponents, outscoring Wabash, DePauw, Indiana, and Butler by a combined score of 192 to 0.', \" Purdue's 60–0 victory over Indiana was the first installment in a rivalry which later became noted for the award of the Old Oaken Bucket trophy.\"]], ['Ken McAlister', ['Kenneth H. 
McAlister (born April 15, 1960) is a former American football linebacker who played five seasons in the National Football League with the Seattle Seahawks, San Francisco 49ers and Kansas City Chiefs.', ' He played college basketball at the University of San Francisco and attended Oakland High School in Oakland, California.', ' He did not play college football and made the Seahawks roster in 1982.']], ['Jamal Anderson', ['Jamal Sharif Anderson (born September 30, 1972) is a former American football running back of the National Football League.', ' He was drafted by the Atlanta Falcons in the seventh round of the 1994 NFL Draft.', ' He played high school football at El Camino Real High School, where he was named to the CIF Los Angeles City Section 4-A All-City first team in 1989.', ' He went on to play college football at Moorpark College for the Moorpark College Raiders before playing at Utah.']], ['Seantrel Henderson', ['Seantrel Henderson (born January 21, 1992) is an American football offensive tackle for the Buffalo Bills of the National Football League (NFL).', ' He was drafted by the Bills in the seventh round of the 2014 NFL Draft.', ' He played college football at Miami.', ' Henderson attended Cretin-Derham Hall High School and originally signed a letter of intent to play college football at the University of Southern California, but was released from his commitment in July 2010 and eventually committed to the University of Miami.']], ['George Thomas (American football)', ['George Carroll Thomas, Jr. 
(March 4, 1928 – May 23, 1989) was an American football halfback and defensive back in the National Football League for the Washington Redskins and the New York Giants.', ' He was a standout high school basketball player, which led to his being recruited to play college basketball for Tulane University.', ' However, first year OU football coach, Jim Tatum, convinced him stay in Oklahoma and play college football at the University of Oklahoma.', \" Thomas was a standout for the Sooners, lettering in '46, '47,'48 and '49.\", ' He earned All-American status in 1949 List of Oklahoma Sooners football All-Americans.', ' Thomas graduated from OU with a degree in Business Administration in 1950.']], ['Charlie Spiller', ['Charlie Spiller (born October 18, 1983) is a former American football wide receiver.', ' He was signed by the Tampa Bay Buccaneers as a street free agent in 2008.', ' He played college football at Alcorn State.']], ['Baron Batch', ['Baron Batch (born December 21, 1987), self-styled \"The Artist\", a Pittsburgh-based entrepreneur and former American football running back who retired from the NFL in 2013.', ' He is known for his \"FREE\" art drops, where he posts pictures of giveaway paintings on Instagram and Twitter, leaving clues to their location.', ' He played college football at Texas Tech University.', ' Batch chose to play college football at Texas Tech University over offers from Northwestern University, Duke University, and New Mexico State University.', ' Batch is from Midland, Texas.', ' He is the owner and creator of Angry Man Salsa and creative director of Studio AM.', ' He is the brother of Brian Batch of the band Alpha Rev.']], ['Johnson Bademosi', ['Johnson Bademosi (born July 23, 1990) is an American football cornerback and special teamer for the New England Patriots of the National Football League (NFL).', ' He was signed by the Browns as an undrafted free agent in 2012.', ' He was a member of the football, rugby, and track and field teams 
at Gonzaga College High School and went on to play college football for Stanford University.']], ['Ross Travis', ['Ross John Travis (born January 9, 1993) is an American football tight end for the Kansas City Chiefs of the National Football League (NFL).', ' He played college basketball at Penn State and did not play college football.', ' He signed with the Chiefs in 2015.']], ['Alcorn State University', ['Alcorn State University (Alcorn) is a historically black comprehensive land-grant institution located northwest of Lorman, Mississippi in rural Claiborne County.', ' It was founded in 1871 by the Reconstruction-era legislature to provide higher education for freedmen.', ' It is the first black land grant college established in the United States.', ' The university is counted as a census-designated place and had a resident population of 1,017 at the 2010 census.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 381/500 [03:41<03:32, 1.79s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:41.800\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae16ea85542990adbacf790', 'answer': '31 July 1975', 'question': 'When did the UVF Mid-Ulster Brigade conducted the attack The Miami Showband killings?', 'supporting_facts': [['UVF Mid-Ulster Brigade', 0], ['UVF Mid-Ulster Brigade', 5], ['Miami Showband killings', 0]], 'context': [['The Miami Showband', ['The Miami Showband were one of the most successful and popular showbands in Ireland in the 1960s and 1970s.', \" Led at first by singer Dickie Rock, and later by Fran O'Toole, they had seven number one records on the Irish singles chart.\", ' In 1975 during The Troubles, when returning from a performance in County Down, Northern Ireland, three members of the band, Fran O\\'Toole, Tony Geraghty, and Brian McCoy, were killed in what became known as the \"Miami Showband massacre\".']], ['Andrew Robb and David McIlwaine killings', ['The Tandragee killings took place in the early hours of Saturday 19 February 2000 on an isolated country road outside Tandragee, County Armagh, Northern Ireland.', ' Two young Protestant men, Andrew Robb and David McIlwaine, were beaten and repeatedly stabbed to death in what was part of a Loyalist feud between the loyalist Ulster Volunteer Force (UVF) and their rivals, the breakaway Loyalist Volunteer Force (LVF).', ' The men were not members of any loyalist paramilitary organisation.', ' It later emerged in court hearings that Robb had made disparaging remarks 
about the killing of UVF Mid-Ulster Brigade leader Richard Jameson by an LVF gunman the previous month.', ' This had angered the killers, themselves members of the Mid-Ulster UVF, and in retaliation they had lured the two men to the remote lane on the outskirts of town, where they killed and mutilated them.']], ['Loyalist Volunteer Force', ['The Loyalist Volunteer Force (LVF) is a small Ulster loyalist paramilitary group in Northern Ireland.', ' It was formed by Billy Wright in 1996 when he and his unit split from the Ulster Volunteer Force (UVF) after breaking its ceasefire.', \" They had belonged to the UVF's Mid-Ulster Brigade and Wright had been the brigade's commander.\", ' In a two-year period from August 1996, the LVF waged a paramilitary campaign with the stated goal of combatting Irish republicanism.', ' During this time it killed at least 14 people in gun and bomb attacks.', ' Almost all of its victims were Catholic civilians who were killed at random.', ' The LVF called off its campaign in August 1998 and decommissioned some of its weapons, but in the early 2000s a loyalist feud led to a number of killings.', ' Since then, the LVF has been largely inactive, but its members are believed to have been involved in rioting and organized crime.', ' In 2015, the security forces stated that the LVF \"exists only as a criminal group\" in Mid-Ulster and Antrim.']], ['UVF Mid-Ulster Brigade', ['UVF Mid-Ulster Brigade formed part of the loyalist paramilitary Ulster Volunteer Force in Northern Ireland.', ' The brigade was established in Lurgan, County Armagh in 1972 by its first commander Billy Hanna.', ' The unit operated mainly around the Lurgan and Portadown areas.', ' Subsequent leaders of the brigade were Robin Jackson, known as \"The Jackal\", and Billy Wright.', ' The Mid-Ulster Brigade carried out many attacks, mainly in Northern Ireland, especially in the South Armagh area, but it also extended its operational reach into the Republic of Ireland.', ' Two of 
the most notorious attacks in the history of the Troubles were carried out by the Mid-Ulster Brigade: the 1974 Dublin and Monaghan bombings and the Miami Showband killings in 1975.', ' Members of the Mid-Ulster Brigade were part of the Glenanne gang which the Pat Finucane Centre has since linked to at least 87 lethal attacks in the 1970s.']], ['Harris Boyle', ['Harris Boyle (1953 – 31 July 1975) was an Ulster Defence Regiment (UDR) soldier and a high-ranking member of the Ulster Volunteer Force (UVF), a Northern Irish loyalist paramilitary organisation.', ' Boyle was implicated in the 1974 Dublin and Monaghan bombings, and took part in the attack at Buskhill, County Down when an armed UVF gang wearing British Army uniforms ambushed The Miami Showband at a bogus military checkpoint.', ' The popular Irish cabaret band was driving home to Dublin after a performance in Banbridge.', \" He was one of the two gunmen killed when the bomb they had loaded onto the band's minibus exploded prematurely.\", ' He is sometimes referred to as Horace Boyle.']], ['John Francis Green', ['John Francis Green (18 December 1946 – 10 January 1975), was a leading member of the North Armagh Brigade of the Provisional Irish Republican Army, holding the rank of Staff Captain and Intelligence Officer.', ' He was killed in a farmhouse outside Castleblayney, County Monaghan, by members of the Mid-Ulster Brigade of the Ulster Volunteer Force (UVF).', \" According to Secret Intelligence Service operative Captain Fred Holroyd, British Army Captain Robert Nairac was involved in Green's killing.\", \" Green's was one of the 87 killings attributed by the Pat Finucane Centre to the group of loyalist extremists known as the Glenanne gang.\", ' No one was ever prosecuted for the killing.']], ['Miami Showband killings', ['The Miami Showband killings (also called the Miami Showband Massacre) was an attack by the Ulster Volunteer Force (UVF), a loyalist paramilitary group, on 31 July 1975.', ' It took place 
on the A1 road at Buskhill in County Down, Northern Ireland.', \" Five people were killed, including three members of The Miami Showband, who were then one of Ireland's most popular cabaret bands.\"]], ['1991 Cappagh killings', ['The 1991 Cappagh killings was a gun attack by the loyalist Ulster Volunteer Force (UVF) on 3 March 1991 in the village of Cappagh, County Tyrone, Northern Ireland.', \" A unit of the UVF's Mid-Ulster Brigade drove to the staunchly republican village and shot dead three Provisional IRA volunteers and a Catholic civilian at Boyle's Bar.\"]], ['Wesley Somerville', ['William Wesley Somerville (c. 1941 – 31 July 1975) was a Northern Irish loyalist, who held the rank of lieutenant in the illegal Ulster Volunteer Force\\'s (UVF) Mid-Ulster Brigade during the period of conflict known as \"the Troubles\".', \" He also served as a member of the British state's legal Ulster Defence Regiment (UDR).\", ' Somerville was part of the UVF unit that ambushed the Irish cabaret band The Miami Showband at Buskhill, County Down, which resulted in the deaths of three of the bandmembers.', \" Somerville was killed, along with Harris Boyle, when the bomb they had loaded onto the band's minibus exploded prematurely.\", \" His brother, John James Somerville (a former UDR soldier) was one of the three men convicted of the murders of bandmembers Brian McCoy, Fran O'Toole and Tony Geraghty.\"]], ['Richard Jameson (loyalist)', [\"Richard Jameson (c. 
1953 – 10 January 2000), was a Northern Irish businessman and loyalist, who served as the leader of the paramilitary Ulster Volunteer Force's (UVF) Mid-Ulster Brigade.\", ' He was killed outside his Portadown home during a feud with the rival Loyalist Volunteer Force (LVF), the breakaway organisation founded by former Mid-Ulster UVF commander Billy Wright after he and the Portadown unit of the Mid-Ulster Brigade were officially stood down by the Brigade Staff (Belfast leadership) in August 1996.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:24:47.570\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:47.574\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 77%|███████▋ | 383/500 [03:47<03:58, 2.04s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:47.695\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 77%|███████▋ | 385/500 [03:47<02:58, 1.55s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:47.728\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:47.767\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:47.856\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:47.856\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 388/500 [03:47<01:55, 1.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:47.863\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 391/500 [03:53<02:28, 1.36s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:54.784\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 392/500 [03:54<02:19, 1.29s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:24:55.050\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ac02835554299294b218f2e', 'answer': 'Manchester', 'question': 'Frank Lamson-Scribner was adopted by a family near which town in Kennebec County?', 'supporting_facts': [['Frank Lamson-Scribner', 0], ['Frank Lamson-Scribner', 1], ['Manchester, Maine', 0]], 'context': [['Winslow, Maine', ['Winslow is a town and census-designated place in Kennebec County, Maine, United States, along the Kennebec River.', ' The population was 7,794 at the 2010 census.']], ['Frank Lamson-Scribner', ['Franklin Pierce Lamson was born April 19, 1851 in Cambridgeport, Massachusetts.', ' His parents Joseph Sanborn and Eunice Ellen (Winslow) Lamson died when he was 3 years old and he was adopted by the Virgil Scribner family near Manchester, Maine.', ' He received preparatory education at Hebron Academy, Kents Hill School, and Coburn Classical Institute and graduated from Maine State College of Agriculture and Mechanic Arts in 1873.']], ['Kennebec, North Carolina', ['Kennebec is a small unincorporated community in southern Wake County, North Carolina along the border of Harnett County.', ' The community is situated along North Carolina Highway 55 and is the site of the Fuquay-Angier Airfield (Kennebec Flying Club).', ' Much of the area has been recently annexed by the Harnett County town of Angier.', ' Kennebec was named for Kennebec County, Maine .', ' The community was 
also a stop on the former Durham and Southern Railway.']], ['Two Cent Bridge', ['The Ticonic Footbridge, popularly known as the Two Cent Bridge, is a suspension bridge that spans the Kennebec River between the city of Waterville and the town of Winslow in Kennebec County, Maine.', ' It is one of the oldest surviving wire-cable steel suspension bridges and also is considered to be the last known extant toll footbridge in the United States.']], ['Jon A. Lund', ['Jon A. Lund (born November 6, 1928) is an American attorney and politician from Maine.', ' Lund, a Republican, served as Maine Attorney General from 1972-1975.', ' Prior to his time as the first full-time attorney general in Maine history, Lund was an assistant country attorney for Kennebec County, member of the Augusta City Council and two-time county attorney for Kennebec County.', ' He was also elected to the Maine House of Representatives (1965-1966; 1969-1972) and Maine Senate (1967-1968).']], ['Kennebec County, Maine', ['Kennebec County is a county located in the U.S. 
state of Maine, in the United States.', ' As of the 2010 census, the population was 122,151.', ' Its county seat is Augusta, the state capital.', ' The county was established on 20 February 1799 from portions of Cumberland and Lincoln Counties.', ' The name Kennebec comes from the Eastern Abenaki \"/kínipekʷ/\", meaning \"large body of still water, large bay.\"']], ['Manchester, Maine', ['Manchester is a town in Kennebec County, Maine, United States, located at .', ' The population was 2,580 at the 2010 census.', ' The southern part of the town bordering Cobbosseecontee Lake is a popular recreation spot in central Maine, and part of the Winthrop Lakes Region.', ' Manchester is included in the Augusta, Maine micropolitan New England City and Town Area.']], ['Kennebec County Courthouse', ['The Kennebec County Courthouse is located at 95 State Street in Augusta, Maine, the state capital and county seat of Kennebec County.', ' Built in 1829 and twice enlarged, it is one of the oldest examples of Greek Revival architecture in the state, and its earliest known example of a Greek temple front.', ' The building, which is now mostly taken up by county offices, was listed on the National Register of Historic Places in 1974.']], ['China Lake (Maine)', ['China Lake is a lake in Kennebec County, Maine.', ' Located northeast of the state capital of Augusta, China Lake is situated in the towns of China and Vassalboro.', ' China Lake has two large basins connected by a narrow neck.', ' The elongated eastern basin with an average depth of less than 30 ft is entirely within the town of China, and has an irregular shoreline heavily developed with residences and seasonal cottages.', ' The more nearly circular western basin extending into East Vassalboro is as deep as 85 ft , and shoreline development around the western basin has been discouraged to allow use as a water supply for Waterville and Winslow.', ' The western basin overflows into Outlet Stream in the town of Vassalboro.', ' 
Outlet Stream flows 7 mi north to discharge into the Sebasticook River in Winslow 1 mi upstream of the Kennebec River.']], ['Waterville, Maine', ['Waterville is a city in Kennebec County of the U.S. state of Maine, United States, on the west bank of the Kennebec River.', ' Home to Colby College and Thomas College, the population was 15,722 at the 2010 census.', ' Waterville is also the second city which makes up the \"Augusta-Waterville, ME Micropolitan Statistical Area\".']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 79%|███████▊ | 393/500 [03:54<02:00, 1.12s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:24:55.061\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abbee315542993f40c73c1d', 'answer': 'yes', 'question': 'Could you read both Bicycling and National Review', 'supporting_facts': [['Bicycling (magazine)', 0], ['National Review', 0]], 'context': [['National Review', ['National Review (NR) is an American semi-monthly conservative editorial magazine focusing on news and commentary pieces on political, social, and cultural affairs.', ' The magazine was founded by the author William F. Buckley Jr. 
in 1955.', ' It is currently edited by Rich Lowry.']], [\"Kate O'Beirne\", [\"Kate Walsh O'Beirne (September 23, 1949\\xa0– April 23, 2017) was the President of National Review Institute.\", ' She was the Washington editor of \"National Review\".', ' Her column, \"Bread and Circuses,\" covered Congress, politics, and U.S. domestic policy.']], ['Kevin D. Williamson', ['Kevin Daniel Williamson (born September 18, 1972) is the roving correspondent for \"National Review.\"', ' He is also the theater critic for \"The New Criterion\".', ' He was previously deputy managing editor at \"National Review\".']], ['Jim Geraghty', ['Jim Geraghty is a conservative blogger and regular contributor to \"National Review Online\" and \"National Review\".', ' In addition to writing columns for \"National Review\", Geraghty also blogs for National Review Online and is a former reporter for States News Service.']], ['National Review Board', ['The National Review Board (full name: National Review Board for the Protection of Children and Young People) is a committee created in 2002 by the United States Conference of Catholic Bishops in order to monitor the implementation of the Charter for the Protection of Children and Young People in the wake of the clerical abuse scandal in the United States.', ' The Board was also charged with investigating the scandal, which it did in part by commissioning the John Jay College to conduct a survey of Church records in order to define the nature and scope of the abuse committed by priests over the period between 1950 and 2002.', ' The results of that survey were released in 2004 in what has come to be known as the \"John Jay Report\" or \"Nature and Scope report.\"', ' In parallel with the John Jay College survey the Board conducted interviews with a variety of people, both inside and outside the Church, who were well placed to comment on the scandal, and on the basis of these interviews prepared a more broad-ranging report of its own.', ' Whereas the 
John Jay College report was (as intended) primarily a factual summary of the data collected in the College’s survey of Church records, the Board’s own report sought to interpret these data and its other findings in order to explain why the “epidemic” of clerical abuse had occurred and to identify the appropriate steps to avoid any repetition.']], ['Mark Krikorian', ['Mark Krikorian has been the executive director of the Center for Immigration Studies, a think-tank in Washington, D. C., since 1995.', ' The Center describes itself as an \"independent, non-partisan research organization\" in Washington, D. C., that examines and critiques the impact of immigration on the United States.', ' Animated by a \"pro-immigrant, low-immigration vision which seeks fewer immigrants, but a warmer welcome for those admitted\", the Center was established in 1985 to provide immigration research.', ' Krikorian is a regular contributor to the conservative publication \"National Review\", and is a regular participant at \"National Review Online\\'s\" \"The Corner.\"']], ['Review', ['A review is an evaluation of a publication, service, or company such as a movie (a movie review), video game (video game review), musical composition (music review of a composition or recording), book (book review); a piece of hardware like a car, home appliance, or computer; or an event or performance, such as a live music concert, play, musical theater show, dance show, or art exhibition.', \" In addition to a critical evaluation, the review's author may assign the work a rating to indicate its relative merit.\", ' More loosely, an author may review current events, trends, or items in the news.', ' A compilation of reviews may itself be called a review. \"', 'The New York Review of Books\", for instance, is a collection of essays on literature, culture, and current affairs. \"', 'National Review\", founded by William F. 
Buckley, Jr., is an influential conservative magazine, and \"Monthly Review\" is a long-running socialist periodical.']], ['Charles C. W. Cooke', ['Charles C. W. Cooke (born November 4, 1984) is the editor of \"National Review Online\".', ' He took the role over after Rich Lowry stepped down in June, 2016 (Lowry remains the editor-in-chief of \"National Review\").', ' Cooke is the author of \"The Conservatarian Manifesto\" and a frequent guest on HBO\\'s \"Real Time with Bill Maher\".', ' In addition to \"National Review\", he has written for the \"New York Times\", the \"Washington Post\", and the \"Los Angeles Times\".', ' Along with Kevin D. Williamson, he hosts the popular \"Mad Dogs and Englishmen\" podcast.']], ['Bicycling (magazine)', ['Bicycling is a cycling brand published by Rodale, Inc. in Emmaus, Pennsylvania.', ' \"Bicycling\" claims to be the world’s largest cycling magazine.']], ['The Human Life Review', ['The Human Life Review is a quarterly journal published by the Human Life Foundation since 1975.', ' It is devoted to explorations of life issues, primarily abortion, as well as neonaticide, medical genetics, prenatal testing, human cloning, fetal tissue experimentation, euthanasia and assisted suicide, and also publishes articles dealing with more general questions of family and society.', ' It was founded by James Patrick McFadden, formerly associate publisher of \"National Review\", who had also founded the Human Life Foundation, and is now edited by his daughter, Maria McFadden.', ' It was launched from the offices of \"National Review\", with the support of William F. Buckley.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 79%|███████▉ | 395/500 [03:54<01:23, 1.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:24:55.219\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abc3d215542993a06baf8a5', 'answer': 'Acharacle', 'question': 'Loch Shiel is part of the river that drains into the sea near the castle that sits west of what town?', 'supporting_facts': [['Loch Shiel', 1], ['Castle Tioram', 0], ['Castle Tioram', 1]], 'context': [['Angle, Pembrokeshire', ['Angle (Welsh: \"Angl\" ) is a village and community on a narrow peninsula on the very south-west tip of Wales in Pembrokeshire.', \" It has two public houses, a school, post office, a castle, St Mary's church and a sandy beach to the west of the village.\", ' The nearest railway station is Pembroke, from where there is a bus link.', ' The Angle lifeboat received silver medals in 1878 for rescuing the crew of the \"Loch Shiel\" from rocks near Thorn Island.', ' The ship had been carrying a cargo of whisky and beer.']], ['Polloch', ['Polloch is a remote hamlet, located at the north shore of the River Polloch, in an inlet that flows into Loch Shiel, in Inverness-shire, Scottish Highlands and is in the Scottish council area of Highland.']], ['The Rough Bounds', ['The Rough Bounds (Scottish Gaelic: \"Na Garbh Chriochan\" ), in the Scottish Highlands, is the area of West Inverness-shire bounded by Loch Hourn, Loch Shiel, and Loch Moidart, 
consisting of the districts of Knoydart, North Morar, Arisaig and Moidart.', ' The area is famous for its wildness and inaccessibility and remains very sparsely populated.']], ['Sgùrr Ghiubhsachain', ['Sgùrr Ghiubhsachain is a mountain in the Lochaber area in the west of Scotland.', ' Its summit is the highest point in a group of mountains that stand south of Glenfinnan, to the south east of the northern part of Loch Shiel.', ' It is considerably lower than the nearby Nevis range, but it is a long way from a public road.', ' Its slopes are steep and rugged on all sides and are devoid of paths.', ' Despite easy access to the trail head from the road from Fort William to Mallaig, an individual or party that climbs this mountain may be alone there, even on a fine day in the summer.']], ['Shiel Bridge', ['Shiel Bridge is a village on the south east mouth of Loch Duich and confluence of the small loch in Loch Shiel and the River Shiel, in Lochalsh, Scottish Highlands and is in the council area of Highland.']], ['River Shiel', ['The River Shiel (Scottish Gaelic: Abhainn Seile) is a four kilometre long river in Acharacle, Highland.', ' It flows out of the Loch Shiel into the sea at Dorlin.']], ['Castle Tioram', ['Castle Tioram ( ) (Scottish Gaelic: \"Caisteal Tioram\" , meaning \"dry castle\") is a ruined castle that sits on the tidal island Eilean Tioram in Loch Moidart, Lochaber, Highland, Scotland.', ' It is located west of Acharacle, approximately 80 km from Fort William.', ' Though hidden from the sea, the castle controls access to Loch Shiel.', ' It is also known to the locals as \"Dorlin Castle\".', ' The castle is a scheduled monument.']], ['Loch Shiel', ['Loch Shiel (Scottish Gaelic: Loch Seile) is a 17+1/2 mi freshwater loch, 120 m (393 ft) deep, situated 12.4 miles west of Fort William in Lochaber, Highland, Scotland.', ' Its nature changes considerably along its length, being deep and enclosed by mountains in the north east and shallow surrounded by bog and 
rough pasture in the south west, from which end the 4 km River Shiel drains to the sea in Loch Moidart near Castle Tioram.']], ['Moidart', ['Moidart ( , Scottish Gaelic: Mùideart ] ) is part of the remote and isolated area of Scotland, west of Fort William, known as the Rough Bounds.', ' Moidart itself is almost surrounded by bodies of water : Loch Shiel cuts off the eastern boundary of the district (along a south-south-west to north-north-east line), and continues along part of the southern edge; the remainder of the southern edge is cut off by Loch Moidart; the north is cut off by Loch Morar and Loch Ailort.']], ['Glenfinnan Viaduct', ['The Glenfinnan Viaduct is a railway viaduct on the West Highland Line in Glenfinnan, Inverness-shire, Scotland.', ' Located at the top of Loch Shiel in the West Highlands of Scotland, the viaduct overlooks the Glenfinnan Monument and the waters of Loch Shiel.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:24:55.398\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. 
Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 79%|███████▉ | 397/500 [03:54<00:59, 1.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 80%|███████▉ | 399/500 [03:56<01:03, 1.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 403/500 [03:56<00:30, 3.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 406/500 [03:56<00:20, 4.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 82%|████████▏ | 408/500 [03:57<00:18, 5.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 82%|████████▏ | 412/500 [03:57<00:13, 6.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 
'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 84%|████████▍ | 419/500 [03:57<00:06, 12.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 421/500 [03:58<00:05, 13.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 85%|████████▌ | 426/500 [03:58<00:06, 11.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 86%|████████▌ | 428/500 [03:58<00:06, 10.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 86%|████████▌ | 430/500 [03:58<00:06, 10.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 
'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 433/500 [03:59<00:05, 11.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 88%|████████▊ | 442/500 [04:01<00:09, 6.04it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8235294117647058, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 445/500 [04:01<00:07, 7.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 90%|█████████ | 450/500 [04:01<00:04, 10.42it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████ | 453/500 [04:02<00:05, 
8.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████ | 455/500 [04:02<00:06, 7.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 93%|█████████▎| 463/500 [04:03<00:03, 12.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 94%|█████████▎| 468/500 [04:03<00:02, 12.65it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 94%|█████████▍| 469/500 [04:23<01:01, 1.97s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:25:40.924\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': 
'5a8e068b5542995085b37384', 'answer': 'yes', 'question': 'Are Ferocactus and Silene both types of plant?', 'supporting_facts': [['Ferocactus', 0], ['Silene', 0]], 'context': [['Silene latifolia', ['Silene latifolia (formerly \"Melandrium album\"), the white campion is a dioecious flowering plant in the family Caryophyllaceae, native to most of Europe, Western Asia and Northern Africa.', ' It is a herbaceous annual, occasionally biennial or a short-lived perennial plant, growing to between 40-80 centimetres tall.', ' It is also known in the USA as bladder campion but should not be confused with \"Silene vulgaris\", which is more generally called Bladder Campion.']], ['Silene tomentosa', ['Silene tomentosa, the Gibraltar campion, is a very rare flowering plant of the genus \"Silene\" and the family Caryophyllaceae.', ' It is a woody-based perennial about 40cm high, with bilobed flowers ranging from pink to pale violet and is endemic to Gibraltar.']], ['Silene conoidea', ['Silene conoidea is a species of flowering plant in the pink family known by the common names weed silene and large sand catchfly.', ' It is native to Eurasia, and it is known in other parts of the world, such as western North America, as a weed.', ' It is an annual herb growing up to a meter in height with a hairy, partially glandular stem.', ' The lance-shaped leaves are up to 12 centimeters long near the base of the plant and smaller farther up.', ' The flower is enclosed in an inflated, hairy, glandular calyx of fused sepals which is ridged with many veins.', ' It is open at the top, revealing five bright pink petals.']], ['Silene', ['Silene is a genus of flowering plants in the family Caryophyllaceae.', ' Containing approximately 700 species, it is the largest genus in the family.', ' Common names include campion (which is shared with the related genus \"Lychnis\") and catchfly.', ' Many \"Silene\" species are widely distributed, particularly in the northern hemisphere.']], ['Silene 
stenophylla', ['Silene stenophylla is a species of flowering plant in the family Caryophyllaceae.', ' Commonly called narrow-leafed campion, it is a species in the genus \"Silene\".', ' It grows in the Arctic tundra of far eastern Siberia and the mountains of Northern Japan.', ' Frozen samples, estimated via radiocarbon dating to be around 32,000 years old, were discovered in the same area as current living specimens, and in 2012 a team of scientists successfully regenerated a plant from the samples.']], ['Silene menziesii', [\"Silene menziesii is a species of flowering plant in the pink family known by the common names Menzies' campion and Menzies' catchfly.\", ' It is native to western North America from Alaska through the western half of Canada to the southwestern United States.', ' It can be found in many types of habitat and it is quite common in much of its range.', ' It is variable in morphology and there are a number of varied subtaxa.', ' In general, it is a perennial herb growing from a caudex, appearing matlike, decumbent, or erect, with stems a few centimeters to over half a meter long.', ' It is usually hairy in texture, with upper parts bearing sticky glandular hairs.', ' The leaves are lance-shaped, oppositely arranged in pairs, and a few centimeters in length, upper leaves usually smaller than lower.', ' Flowers may occur in a cyme at the top of the stem, or in leaf axils, or both.', ' Each is encapsulated in a hairy, veined calyx of fused sepals.', ' The petals are white with two lobes at the tips.', ' The plant is dioecious with male and female plants producing different flowers.', ' The male and female flower types look the same externally; the stamens are reduced in female plants and the stigmas are reduced in the male.']], ['Silene suksdorfii', [\"Silene suksdorfii is a species of flowering plant in the pink family known by the common names Suksdorf's silene, Suksdorf's catchfly and Cascade alpine campion.\", ' It is native to the Pacific 
Northwest of the United States, where it occurs from Washington and Idaho to northern California.', ' It is mainly an alpine species, growing in the talus of high mountain slopes.', ' It can also be found below the tree line in forested subalpine habitat.', ' It is a squat perennial herb producing several erect stems from a leafy, woody caudex.', ' It generally takes a clumpy form.', ' The stems grow up to 10 or 15 centimeters tall and are hairy in texture, with glandular, sticky areas on the upper parts.', ' The leaves occur in tufts around the caudex.', ' They are fleshy and coated in soft hairs.', ' Solitary flowers arise on erect peduncles.', ' Each is encapsulated in an inflated calyx of fused sepals, which is starkly purple-veined and has purplish glandular hairs.', ' The petals are white or purple-tinged and have two lobes at their tips and appendages at their bases.']], ['Ferocactus', ['Ferocactus is a genus of large barrel-shaped cacti, mostly with large spines and small flowers.', ' There are about 30 species included in the genus.', ' They are found in the southwestern United States and northwestern Mexico.']], ['Silene antirrhina', ['Silene antirrhina is a species of flowering plant in the pink family known by the common names sleepy silene and sleepy catchfly.', ' It is native to the Americas, where it is widespread throughout North America and parts of South America.', ' It is known in Europe as an introduced species.']], ['Sex determination in Silene', ['\"Silene\" are a flowering plant that evolved a dioecious reproductive system.', ' This is made possible through heteromorphic sex chromosomes expressed as XY.', ' \"Silene\" recently evolved sex chromosomes 5-10 million years ago and are widely used by geneticists and biologists to study the mechanisms of sex determination since they are one of only 39 species across 14 families of angiosperm that possess sex-determining genes.', ' \"Silene\" are studied because of their ability to produce offspring 
with a plethora of reproductive systems.', ' The common inference drawn from such studies is that the sex of the offspring is determined by the Y chromosome.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 94%|█████████▍| 470/500 [04:40<01:54, 3.81s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:25:40.982\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ab8854555429934fafe6e0c', 'answer': 'drawings', 'question': 'Works by Hanna Leena Kristiina Varis are part of a collection in a museum that houses approximately 65,000 what?', 'supporting_facts': [['Hanna Varis', 0], ['Hanna Varis', 4], ['Albertina', 0], ['Albertina', 1]], 'context': [['Amistad Dam Port of Entry', ['The Amistad Dam Port of Entry is a port of entry into the United States from Mexico.', ' It was built when Amistad Dam was completed in 1969.', ' The Dam was a bi-national effort to establish flood control on the Rio Grande and provide sources of water.', ' Although US Department of Transportation statistics combine traffic counts with Del Rio Texas Port of Entry, approximately 65,000 vehicles crossed the dam into the US in 2005.']], ['Pakistan Air Force', ['The Pakistan Air Force (PAF) (Urdu: \\u200e —\"Pāk Fizāʾiyah\" 
, , reporting name: PAF) is the aerial warfare branch of the Pakistan Armed Forces, tasked primarily with the aerial defence of Pakistan, with a secondary role of providing air support to the Pakistan Army and the Pakistan Navy.', ' The PAF also has a tertiary role of providing strategic air transport and logistics capability to Pakistan.', ' The PAF employs approximately 65,000 full-time personnel (including approximately 3,000 pilots) and currently operates 883 aircraft.']], ['Remington Model 51', ['The Remington Model 51 is a small pocket pistol designed by John Pedersen and manufactured by Remington Arms in the early 20th century for the American civilian market.', ' Remington manufactured approximately 65,000 Model 51 pistols in .32 ACP and .380 ACP calibers from 1918 to 1927, though small numbers were assembled into the mid-1930s.']], ['United Negro College Fund', ['The United Negro College Fund, also known as UNCF or the United Fund, is an American philanthropic organization that funds scholarships for black students and general scholarship funds for 37 private historically black colleges and universities.', ' UNCF was incorporated on April 25, 1944 by Frederick D. 
Patterson (then president of what is now Tuskegee University), Mary McLeod Bethune, and others.', ' UNCF is headquartered at 1805 7th Street, NW in Washington, D.C.', ' In 2005, UNCF supported approximately 65,000 students at over 900 colleges and universities with approximately $113 million in grants and scholarships.', ' About 60% of these students are the first in their families to attend college, and 62% have annual family incomes of less than $25,000.', ' UNCF also administers over 450 named scholarships.']], ['Karaboro languages', ['The Karaboro languages are spoken in Burkina Faso by approximately 65,000 people (SIL 1995/1991).', ' They belong to the Senufo subfamily, but are separated from other Senufo languages by a small band of unrelated languages.', ' Within Senufo they are thought to be most closely related to the Senari languages.']], ['Marovoay', ['Marovoay ] is a city and commune (commune urbaine, Malagasy: \"kaominina\" ) in north-western Madagascar.', ' It belongs to the district of Marovoay, which is a part of Boeny Region.', ' The population of the commune was estimated to be approximately 65,000 in 2001 commune census.']], ['Darayim District', ['Darayim is a district in Badakhshan Province, Afghanistan.', ' It was created in 2005 from part of Fayzabad District and is home to approximately 65,000 residents.']], ['Hanna Varis', ['Hanna Leena Kristiina Varis (b. 
1959 in Kuusankoski) is a Finnish graphic artist and painter.', ' She earned a Master of Arts degree from the Aalto University School of Arts, Design and Architecture in 1990.', ' She participated in the NUROPE, Nomadic University for Art, Philosophy and Enterprise in Europe, in 2006-2010.', ' She has held over 70 solo exhibitions and participated at over 140 group exhibitions.', ' Her works are part of major art collections in Finland and abroad, such as the Kiasma, Amos Anderson Art Museum, and Helsinki Art Museum in Helsinki, Wäinö Aaltonen Museum of Art in Turku, and Albertina Museum in Vienna.']], ['Albertina', ['The Albertina is a museum in the Innere Stadt (First District) of Vienna, Austria.', ' It houses one of the largest and most important print rooms in the world with approximately 65,000 drawings and approximately 1 million old master prints, as well as more modern graphic works, photographs and architectural drawings.', ' Apart from the graphics collection the museum has recently acquired on permanent loan two significant collections of Impressionist and early 20th-century art, some of which will be on permanent display.', ' The museum also houses temporary exhibitions.']], ['Gwangjang Market', ['Gwangjang Market, previously Dongdaemun Market, is a traditional street market in Jongno-gu, Seoul, South Korea.', ' The market is one of the oldest and largest traditional markets in South Korea, with more than 5000 shops and 20,000 employees in an area of 42000 m2 .', ' Approximately 65,000 people visit the market each day.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 94%|█████████▍| 472/500 [04:40<01:20, 2.87s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:41.204\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a89ab1355429946c8d6e996', 'answer': 'yes', 'question': 'Is Carnegie Hall Tower located in the same city as Staten Island?', 'supporting_facts': [['Carnegie Hall Tower', 0], ['Staten Island', 0]], 'context': [['Staten Island Academy', ['Staten Island Academy is a coeducational, college-preparatory day school located on a 14 acre campus in Staten Island in New York City, United States.', ' Founded in 1884 by Anton Methfessel, it is the oldest private school on Staten Island, and is the only independent school (non-public, non-religious) in the borough.', ' It educates students from pre-Kindergarten through grade 12 high school.', ' Current enrollment is 390 students and offers a student to teacher ratio of 7:1.', ' Albert Cauz is the current head of school.', ' The school is composed of three divisions: Lower School, Pre-K-Gr.', ' 4; Middle School, Gr.', ' 5-8; Upper School, Gr.', ' 9-12.', ' The Head of Lower, Middle and Upper School is Eileen Corigliano.', ' The campus has seven buildings: the Early Childhood Building, the Art Barn, Haugen Hall, Kearns Hall, Crowe Hall, Alumni Hall and the OJ Buck Gymnasium.', \" The school's accreditations include the Middle States Association of Colleges and Schools, and the New York State Association of Independent Schools.\", ' It is chartered and registered by the Board of Regents, 
University of the State of New York.']], ['Staten Island Ferry', ['The Staten Island Ferry is a passenger ferry route operated by the New York City Department of Transportation.', ' It runs 5.2 mi in New York Harbor between the New York City boroughs of Manhattan and Staten Island.', ' The ferry operates 24/7, running every 15 to 20 minutes during peak hours and every 30 minutes at other times.', ' Since 1997, the Staten Island Ferry has been fare-free, though historically, it has charged a relatively low fare compared to other modes of transit in the area.', ' The Staten Island Ferry is one of several ferry systems in the New York City area, besides NYC Ferry, New York Water Taxi, and NY Waterway.']], ['Richmond County Courthouse (Staten Island)', ['The Richmond County Courthouse is a 1919 municipal courthouse in the civic center of St. George in the borough of Staten Island in New York City (Richmond County is coextensive with Staten Island).', \" The neoclassical style courthouse is on Richmond Terrace next to Staten Island's Borough Hall and across the street from the Staten Island Ferry terminal.\"]], ['Staten Island', ['Staten Island is one of the five boroughs of New York City in the U.S. 
state of New York.', ' In the southwest of the city, Staten Island is the southernmost part of both the city and state of New York, with Conference House Park at the southern tip of the island and the state.', ' The borough is separated from New Jersey by the Arthur Kill and the Kill Van Kull, and from the rest of New York by New York Bay.', ' With a 2016 Census-estimated population of 476,015, Staten Island is the least populated of the boroughs but is the third-largest in area at 58 sqmi .', ' Staten Island is the only borough of New York with a non-Hispanic White majority.']], ['Carnegie Hall Tower', ['Carnegie Hall Tower is a 60-story skyscraper located on 57th Street in New York City.', ' Part of a cluster of four tall buildings (along with CitySpire Center, Metropolitan Tower and One57), the tower was built in an architectural style in harmony with its western neighbor Carnegie Hall, a New York landmark.']], ['St. George, Staten Island', ['St. George is a neighborhood on the northeastern tip of Staten Island in New York City, where the Kill Van Kull enters Upper New York Bay.', ' It is the most densely developed neighborhood on Staten Island, and the location of the administrative center for the borough and for the coterminous Richmond County.', ' The Staten Island terminal of the Staten Island Ferry is located here, as well as the northern terminus of the Staten Island Railway.', ' St. George is bordered on the south by the neighborhood of Tompkinsville and on the west by the neighborhood of New Brighton.']], ['Staten Island Community Board 1', [\"Staten Island Community Board 1 is a local government unit of the city of New York, encompassing the Staten Island neighborhoods of Arlington, northern Castleton Corners, Clifton Concord, Elm Park, Fort Wadsworth, northern Graniteville, Grymes Hill, Livingston, Mariners' Harbor, northern Meiers Corners, New Brighton, Port Ivory, Port Richmond, Randall Manor, Rosebank, Staten Island, St. 
George, Shore Acres, Silver Lake, Stapleton, Sunnyside, Tompkinsville, West Brighton, Westerleigh, and northern Willowbrook.\", ' Community Board 1 is essentially the entire area of Staten Island north of the Staten Island Expressway.']], ['Staten Island Technical High School', ['Staten Island Technical High School, commonly called Staten Island Tech or SITHS, was founded in 1988.', ' Located in Staten Island, New York City, the school is operated by the New York City Department of Education.', ' In 2005, Staten Island Tech became the only Specialized High School in Staten Island.', ' It consistently ranks among the best schools in New York City in graduation rate, Regents test scores, and attendance.', \" In 2012, SITHS was ranked #1 on the New York Post's list of the city's best high schools, #77 in the nation on U.S. News & World Report's list of Best High Schools, and #23 on their list of the nation's top schools in science, technology, engineering, and math (STEM).\"]], ['Staten Island Register', ['The Staten Island Register was a weekly newspaper serving the borough of Staten Island in New York City as an independent alternative to other news sources, including the \"Staten Island Advance\".', ' It began publication in 1966 under the ownership of the Sclafani family.', ' Joseph was the Owner.', ' The \"Staten Island Register\" was sold in August 2002 to Elauwit, LLC, a company formed by Daniel McDonough of New Jersey, was sold by McDonough to an investor in 2004, and ceased publication in December 2005.']], ['Staten Island Borough Hall', ['Staten Island Borough Hall is the primary municipal building for the borough of Staten Island in New York City.', ' It is located at 10 Richmond Terrace, next to the Richmond County Courthouse and opposite the Staten Island Ferry Terminal.', \" Staten Island Borough Hall houses the Borough President's office, offices of the Departments of Buildings and T\"]]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during 
single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:25:41.283\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8f077b554299458435d526', 'answer': 'Netflix', 'question': '3 Arts Entertainment had at least one coalition with what streaming service, with its 13-episode, 2015 season of a Tina Fey created sitcom?', 'supporting_facts': [['3 Arts Entertainment', 0], ['3 Arts Entertainment', 1], ['Unbreakable Kimmy Schmidt', 0], ['Unbreakable Kimmy Schmidt', 1]], 'context': [['Louie (season 4)', ['The fourth season of the American television comedy series \"Louie\" premiered on May 5, 2014, and concluded on June 16, 2014.', ' It consists of fourteen episodes (an additional episode more than previous seasons), most running approximately 23 minutes in length.', ' FX broadcast the fourth season on Mondays at 10:00 and 10:30 pm in the United States with back-to-back episodes.', ' The season was produced by 3 Arts Entertainment and the executive producers were Louis C.K., Dave Becky and M. 
Blair Breard.']], ['3 Arts Entertainment', ['3 Arts Entertainment is a Beverly Hills–based talent management and television/film production company founded by Erwin Stoff in 1991 in preparation for producing the television show \"Down the Shore\".', ' The company is best known for producing comedic TV shows \"Parks and Recreation\", \"The Mindy Project\", \"Brooklyn Nine-Nine\", \"It\\'s Always Sunny in Philadelphia\", \"Unbreakable Kimmy Schmidt\" but also for blockbuster and dramatic films \"Edge of Tomorrow\", \"Unbroken\" and \"\".']], ['Louie (season 2)', ['The second season of the American television comedy series \"Louie\" premiered on June 23, 2011 and concluded on September 8, 2011.', ' It consisted of thirteen episodes, each running approximately 23 minutes in length.', ' FX broadcast the second season on Thursdays at 10:30 pm in the United States.', ' The season was produced by 3 Arts Entertainment and the executive producers were Louis C.K., Dave Becky and M. Blair Breard.', ' The second season was released on DVD and Blu-ray in region 1 on June 19, 2012.']], ['Louie (season 5)', ['The fifth season of the American television comedy series \"Louie\" premiered on April 9, 2015, and concluded on May 28, 2015.', ' It consists of eight episodes, each running approximately 23 minutes in length.', ' FX broadcast the fifth season on Thursdays at 10:30 pm in the United States.', ' The season was produced by 3 Arts Entertainment and the executive producers were Louis C.K., Dave Becky and M. 
Blair Breard.']], ['The Mindy Project', ['The Mindy Project is an American romantic comedy television series that premiered on Fox on September 25, 2012, and aired on Tuesday nights until March 24, 2015.', ' It then began airing on Hulu on September 15, 2015.', \" The series, created by Mindy Kaling (the series' star), is co-produced by Universal Television and 3 Arts Entertainment.\"]], ['List of 30 Rock episodes', ['\"30 Rock\" is an American satirical television sitcom that ran on NBC from October 11, 2006, to January 31, 2013.', ' Created by Tina Fey, the series follows the lives of the head writer of \"The Girlie Show with Tracy Jordan\" (TGS), Liz Lemon (Tina Fey), the other staff members of \"TGS\", and their network executive, Jack Donaghy (Alec Baldwin).', ' A total of 138 episodes of \"30 Rock\" were produced and aired over seven seasons.']], ['List of 30 Rock characters', ['\"30 Rock\" is an American television comedy series created by Tina Fey, which aired on NBC.', ' The series takes place behind the scenes of a fictional live sketch comedy series, also airing on NBC; the name \"30 Rock\" refers to the address of the GE Building, where NBC Studios is located (30 Rockefeller Plaza).', ' The series has an ensemble cast consisting of 14 regular cast members: Tina Fey, Alec Baldwin, Tracy Morgan, Jane Krakowski, Jack McBrayer, Scott Adsit, Judah Friedlander, Katrina Bowden, Keith Powell, Lonny Ross, John Lutz, Kevin Brown, Grizz Chapman, and Maulik Pancholy.']], ['Great News', ['Great News is an American sitcom television series created and written by Tracey Wigfield (her first series as a creator and producer), and co-executive produced with Tina Fey, Robert Carlock, and David Miner for 3 Arts Entertainment, Little Stranger and Universal Television.', ' The series premiered April 25, 2017 on NBC.']], ['Unbreakable Kimmy Schmidt', ['Unbreakable Kimmy Schmidt is an American television sitcom created by Tina Fey and Robert Carlock, starring Ellie Kemper in 
the title role, that has streamed on Netflix since March 6, 2015.', ' Originally set for a 13-episode first season on NBC for spring 2015, the show was sold to Netflix and given a two-season order.']], ['Cooter (30 Rock)', ['\"Cooter\" is the fifteenth episode of the second season of \"30 Rock\" and the thirty-sixth episode of the series.', \" It was written by series' creator Tina Fey and was directed by one of the season's producers, Don Scardino.\", ' The episode first aired on May 8, 2008, on the NBC network in the United States.', ' \"Cooter\" follows Jack Donaghy\\'s (Alec Baldwin) attempt to get fired from his new job in politics; Liz Lemon\\'s (Fey) pregnancy scare and decision to adopt a baby; Tracy Jordan\\'s (Tracy Morgan) creation of a pornographic video game; and Kenneth Parcell\\'s (Jack McBrayer) aspiration to be an NBC page at the Beijing Olympics.', ' The episode is an unofficial season finale, due to the season being shortened by the 2007–2008 Writers Guild of America strike.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▍| 474/500 [04:40<00:54, 2.11s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:25:41.288\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a78abbc554299148911f90c', 'answer': 'Forbes ranked Schreiber Foods', 'question': 'Which bi-weekly publication ranked Schreiber Foods as number 81 in 2016.', 'supporting_facts': [['Schreiber Foods', 2], ['Forbes', 1]], 'context': [['List of newspapers in the Cayman Islands', ['The Cayman Islands is a group of three islands in the Caribbean Sea.', ' The first monthly publication on the islands was \"The Gospel of the Kingdom\", a religious themed newspaper founded in 1945.', ' In 1964, the newspaper \"Tradewinds\" began publication.', ' This was joined by the rival \"Caymanian Weekly\" in 1965.', ' This was followed by a second weekly publication, the \"Cayman Compass\", which started in 1972.', ' In 1974, the two weeklies merged to form the \"Caymanian Compass\".', ' This became a bi-weekly publication in 1976, appearing on Tuesdays and Fridays.']], ['Port Adelaide News', ['The Port Adelaide News was a newspaper published in Port Adelaide, South Australia between 1878 and 1933 with various sub-titles, several breaks in publication and several periods of bi-weekly publication.']], ['Forbes', ['Forbes ( ) is an American business magazine.', ' Published bi-weekly, it features original articles on finance, industry, investing, and marketing topics.', ' \"Forbes\" also reports on related subjects such as technology, communications, science, politics, and law.', ' Its headquarters is located in 
Jersey City, New Jersey.', ' Primary competitors in the national business magazine category include \"Fortune\" and \"Bloomberg Businessweek\".', \" The magazine is well known for its lists and rankings, including its lists of the richest Americans (the Forbes 400) and rankings of world's top companies (the Forbes Global 2000).\", \" Another well-known list by the magazine is The World's Billionaires list.\"]], ['Adelaide Morning Chronicle', ['The Adelaide Morning Chronicle was a newspaper published in Adelaide, South Australia during 1852 and 1853.', ' While claiming not to be a religious newspaper, the \"Adelaide Morning Chronicle\" was established by the draper Andrew Murray during the South Australian Parliament\\'s debate over separation of church and state.', ' Its intention was to provide a voice for the influential and conservative Anglican section of the Adelaide community.', ' This was in opposition to the opinions expressed by the non-conformist churches in their newspaper, the \"Austral Examiner\".', ' The newspaper was of a sufficient quality to also be seen as competition to the \"South Australian Register\".', ' Murray later worked for the \"Melbourne Argus\".', ' The newspaper was reduced to a bi-weekly publication (rather than daily) after 35 issues in early 1852, through the economic effects of the Victorian gold rush and ceased in early 1853.']], ['Massachusetts Register', ['The Massachusetts Register is the bi-weekly publication mandated by the Administrative Procedures Act (Massachusetts General Law Chapter 30A); it is an official organ of the Massachusetts state government.', ' The Register publishes new and amended regulations; notices of hearings and comment periods related to prospective or draft regulations; and a cumulative index of regulatory changes for the current year.', ' The Register also publishes notices of public interest, as well as opinions of the Attorney General and Executive Orders.', ' The Register is a printed publication; 
online subscription is also available.', ' This era of publication of the Massachusetts Register began in April 1976.']], ['Our Voice Today', ['Our Voice Today (\"OVT\") is a bi-weekly publication of NYSARC, Inc. Originally published as print newsletter Our Children\\'s Voice in March 1949, \"OVT\" has a long history of providing information and resources related to individuals with intellectual and other developmental disabilities.']], ['The Fashion Calendar', ['The Fashion Calendar is an American bi-weekly publication founded by Ruth Finley in 1945, listing all fashion related events in New York City.']], ['The Chronicle (Dominica)', ['The Chronicle is the national newspaper of the Caribbean island nation of Dominica.', ' It was begun by Bishop Philip Schelfhaut in 1909 as the Dominica Chronicle, a bi-weekly publication.', ' For many years afterward, it was known as \"The New Chronicle\" until it dropped the \"New\" from its title in 1996.']], ['Schreiber Foods', ['Schreiber Foods Inc., is a dairy company which produces and distributes natural cheese, processed cheese, cream cheese and yogurt.', ' It is an employee-owned customer brand dairy company headquartered in Green Bay, Wisconsin.', ' With more than $5 billion in annual sales, Forbes ranked Schreiber Foods as the 81st largest private employer in 2016.']], ['Port Plaza Mall', ['Port Plaza Mall (later known as Washington Commons) was an urban area shopping mall/multi-use facility located in downtown Green Bay, Wisconsin.', ' The mall opened on August 10, 1977, and featured 3 anchor stores over the years, with JCPenney and H.C. 
Prange open at its launch and Boston Store added by 1982.', ' The mall would go into a state of decline in the late 1990s and 2000s, leading up to its closure on February 27, 2006.', ' The mall property was razed during the 1st half of 2012 as part of a redevelopment project; the headquarters of Schreiber Foods now stands on the main mall footprint.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:25:41.396\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae0850055429945ae959396', 'answer': 'James Taylor', 'question': 'Who was older, Andrew Preston or James Taylor?', 'supporting_facts': [['Isaac M. Taylor', 0], ['Isaac M. 
Taylor', 1], ['James Taylor', 0]], 'context': [['Andrew Preston (writer)', ['Andrew Preston is a Canadian historian, who won the 2013 Charles Taylor Prize for his book \"Sword of the Spirit, Shield of Faith: Religion in American War and Diplomacy\".', ' He is also a fellow at Clare College, Cambridge where he acts as a director of studies in history.']], ['Andrew Varley', ['Andrew Preston Varley (born December 2, 1934) was an American politician in the state of Iowa.']], ['James Taylor Quartet', ['The James Taylor Quartet (or JTQ) are a British four-piece jazz funk band, who have become renowned for their live performances.', \" They were formed in 1987 by Hammond organ player James Taylor following the break-up of his former band The Prisoners in the wake of Stiff Records' bankruptcy.\", ' The current line-up is James Taylor (Hammond organ), Chris Montague (guitar), Andrew McKinney (bass) and Adam Betts (drums), although recordings and live performances usually feature a horn section comprising John Willmott (tenor sax/flute) and Nick Smart (trumpet), and also vocalist Yvonne Yanney.']], ['Isaac M. 
Taylor', ['Isaac Montrose Taylor (June 15, 1921 – November 3, 1996) was the dean of the Medical School of the University of North Carolina from 1964 until 1971, and the father of James Taylor, the singer and guitarist, and four other children, Alex, Livingston, Hugh, and Kate.', ' Through his second marriage to Suzanne Francis Sheats, he fathered three more children, Andrew Preston (1983), Theodore Haynes (1986), and Julia Rose (1989).']], ['Kate Taylor (album)', [\"Kate Taylor is singer Kate Taylor's second album, released May 4, 1978.\", ' The album included Taylor\\'s sole chart single: her version of \"It\\'s in His Kiss (The Shoop Shoop Song)\", recorded in August 1977 to peak at number 49 that autumn; the \"Kate Taylor\" album also introduced the singer\\'s remakes of \"A Fool in Love\", \"It\\'s Growin\\'\" and \"Stubborn Kind of Woman\" (originally \"Stubborn Kind of Fellow\"); the track \"It\\'s Growin\\'\" was issued as a single in July 1978.', ' The album\\'s other tracks included the debut versions of two James Taylor compositions: \"Happy Birthday Sweet Darling\" and \"Slow and Steady\", and also Kate Taylor\\'s rendition of \"Rodeo\", composed by her brother Livingston Taylor for his 1973 album \"Over the Rainbow\".', ' \"Kate Taylor\" also included the B-side of \"It\\'s in His Kiss\": the self-penned \"Jason & Ida\", and introduced \"Tiah\\'s Cove\" — written by Kate Taylor\\'s husband Charlie Witham - and also the Walter Robinson composition \"Harriet Tubman\": the latter is described by James Taylor biographer Timothy White as \"a searing latterday spiritual\" which is \"the highpoint of Kate\\'s exceptional eleven song set.\"']], ['James Taylor', ['James Vernon Taylor (born March 12, 1948) is an American singer-songwriter and guitarist.', ' A five-time Grammy Award winner, he was inducted into the Rock and Roll Hall of Fame in 2000.', ' He is one of the best-selling music artists of all time, having sold more than 100 million records 
worldwide.']], ['Night Owl (James Taylor song)', ['\"Night Owl\" is a song written by James Taylor that was originally released as a single by Taylor\\'s band the Flying Machine, which also included Danny Kortchmar in 1967.', ' Taylor later rerecorded a solo version of the song for his Apple Records debut album \"James Taylor\" in 1968.', ' Subsequently the Flying Machine version was released on the album \"James Taylor and the Original Flying Machine\".', ' It has also been covered by such artists as Alex Taylor, Carly Simon and Anne Murray.']], ['Something in the Way She Moves', ['\"Something in the Way She Moves\" is a song written by James Taylor that appeared on his 1968 debut album for Apple Records, \"James Taylor\".', ' It has also been covered by other artists, including Tom Rush and Harry Belafonte.', ' The opening line inspired George Harrison to write the #1 Beatles\\' song \"Something.\"', \" According to James Taylor's stage banter at The Star in Frisco July 31, 2017, this was the song he played for Paul McCartney and George Harrison as an audition before signing with Apple Records.\"]], ['Highway Song (James Taylor song)', ['\"Highway Song\" is a song written by James Taylor.', ' It was first released by James Taylor\\'s brother Alex Taylor on his 1971 album \"With Friends and Neighbors\" and as the lead single from the album.', ' It was also released by James Taylor later that year on his album \"Mud Slide Slim and the Blue Horizon\".']], ['The Best of James Taylor', ['The Best of James Taylor is the fourth compilation album by American singer-songwriter James Taylor.', ' The album, a greatest hits collection, was released by Warner Bros.', ' Records in April 2003.', \" The same album was released in Europe as You've Got a Friend: The Best of James Taylor.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in 
East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▌| 477/500 [04:40<00:31, 1.35s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 12:25:41.407\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ab8cd3d5542991b5579f009', 'answer': 'drawing the name out of a hat', 'question': \"How did the name of Peter Gifford's rock band (1980-1987) come to be chosen?\", 'supporting_facts': [['Peter Gifford', 1], ['Midnight Oil', 2]], 'context': [['The Atlantics', ['The Atlantics are an Australian surf rock band founded in 1961.', ' Initially, the band lineup consisted of drummer Peter Hood, bassist Bosco Bosanac, Theo Penglis on lead and rhythm guitar, and guitarist Eddy Matzenik.', ' Matzenik was replaced by Jim Skaithitis while the band was still in its earliest stages.', \" The band's claim to fame was as Australia's most successful of the genre.\", ' Most well known for their classic hit, \"Bombora\", their later recordings such as \"Come On\" are examples of 1960s garage rock.', ' They were the first Australian rock band to write their own hits.', ' In 2000 the group reformed with three of the original members, and continue to release new material and perform in concert.', ' In 2013 the group celebrated the 50th Anniversary of their first album, \"Bombora\" and the eponymous single that was their first to chart.', ' A European tour was organised to mark the occasion.']], [\"Can't 
Stand the Pain\", ['\"Can’t Stand the Pain\" is a song by the Bulgarian rock band “Sevi” and it was the first official single with featured video of Sevi band.', ' Written by Svetlana “Sevi” Bliznakova in her early years of composing, the song was originally recorded as a collaboration between Sevi and Peter Bratanov.', ' He was former guitarist of the alternative rock band \"Awake\" and year later, became a member of the band Sevi and the song was recorded as an official debut single and video of the band.', ' “Can’t Stand The Pain” turned out to be a very successful choice for Sevi.', ' It was liked by the Bulgarian fans of the band, and also it brought Sevi an international success.', ' Thanks to this song, they were chosen for \"Band of the week\" in Lima (Peru).']], ['...And Out Come the Wolves', ['...And Out Come the Wolves is the third studio album by the American punk rock band Rancid.', ' It was released on August 22, 1995, through Epitaph Records.', ' Rancid\\'s popularity and catchy songs made them the subject of a major label bidding war (hence the title, \"...And Out Come the Wolves\" taken from a poem in Jim Carroll\\'s The Basketball Diaries) that ended with the band staying on Epitaph.', \" With a sound heavily influenced by ska, which called to mind Tim Armstrong and Matt Freeman's past in Operation Ivy, Rancid became one of the few bands of the mid-to late-1990s boom in punk rock to retain much of its original fanbase.\", ' In terms of record sales and certifications, \"…And Out Come the Wolves\" is a popular album in the United States.', ' It produced three hit singles: \"Roots Radicals\", \"Time Bomb\" and \"Ruby Soho\", that earned Rancid its heaviest airplay on MTV and radio stations to date.', ' All the singles charted on Modern Rock Tracks. 
\"', '…And Out Come the Wolves\" was certified gold by the RIAA on January 22, 1996.', ' It was certified platinum on September 23, 2004.']], ['Midnight Oil', ['Midnight Oil (also known informally as \"The Oils\" to fans) are an Australian rock band, who originally performed as Farm from 1972 with drummer Rob Hirst, bass guitarist Andrew James and keyboard player/lead guitarist Jim Moginie.', ' While vocalist Peter Garrett was studying at Australian National University in Canberra, he answered an advertisement for a spot in Farm, and by 1975 the band was touring the east coast of Australia.', ' By late 1976, Garrett moved to Sydney to complete his law degree, and Farm changed its name to Midnight Oil by drawing the name out of a hat.']], ['Peter Gifford', ['Peter Gifford (born 5 April 1955), sometimes known as \"Giffo,\" is an Australian musician.', ' From 1980 until 1987 he played bass guitar, Chapman Stick and sang backing vocals for Australian rock band Midnight Oil.', ' His last recorded work with the oils was diesel and dust.', ' he did not tour this album and Bones Hillman was recruited to play bass and fill the hole in what was a raucous rhythm section (Bones remains the oils base man still).']], [\"The Chosen Few (1980's Australian Band)\", ['The Chosen Few were an Australian rock band active between 1985 to 1992.', ' Signed to Mushroom Records and managed by Stuart Coupe, the Chosen Few released four singles and a lone album \"Friends, Foes and Firewood\" (1990).', \" Despite constant national touring (including opening for major touring acts like Jerry Harrison, Cheap Trick and Billy Joel on their respective Australian tours) and receiving support from the country's most influential radio stations; chart success eluded the Chosen Few with only a cover version of the band's song 'Rise' becoming a hit for popular Australian singer Daryl Braithwaite.\", \" Braithwaite's version of 'Rise' is also the title track of his successful 1990 album, and remains a 
staple on Australian commercial radio.\"]], ['Kill Casino', ['Kill Casino is an English rock band, featuring Karen Luan on lead vocals and bass, Chris Ryan on guitar and Paul-Luc Gifford on drums.']], ['Manwomanchild', ['Manwomanchild is a rock band from Philadelphia, Pennsylvania fronted by David Child.', ' The band was started in Rhode Island in 2008.', ' In February 2010, the band release a self-titled EP recorded at Machines with Magnets in Pawtucket, RI.', \" The band's lineup for the EP and album consisted of David Child on vocals, guitars, and synths, Mason Neely on drums, and Craig Gifford on bass.\", ' In May 2010, Manwomanchild released the song \"Chile La Roja\" in support of Chile\\'s 2010 World Cup team.', ' The song was featured in 4 national newspapers and on 2 Chilean TV networks.']], ['Saving Forever', ['Saving Forever is an American pop rock band from South Chicago, Illinois made up of brothers Khaden (born 2004), Kye (born 2002) and Kavah Harris (born 2001).', ' The trio released \"Twenty 1\" followed by the single \"Million Ways\" in 2017 accompanied by a music video.', ' The sibling band comes from a very musical family.', ' It was picked as Elvis Duran\\'s Artist of the Month and was featured on NBC\\'s \"Today\" show hosted by Kathie Lee Gifford and Hoda Kotb and broadcast nationally where they performed live their single \"Million Ways\".']], ['Come On, Come In', ['\"Come On, Come In\" is a song by American hard rock band Velvet Revolver, featured on the soundtrack to the 2005 superhero film \"Fantastic Four\".', ' When released as a promotional single in the United States on June 21, 2005, the song reached number 14 on the American \"Billboard\" Hot Mainstream Rock Tracks chart.', ' The lyrics were written by vocalist Scott Weiland and the music was written by Weiland and the rest of the band; the song was produced by the band, Douglas Grean and Nick Raskulinecz.', ' The music video for \"Come On, Come In\" was directed by Wayne Isham.']]], 
'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 96%|█████████▌| 479/500 [04:46<00:36, 1.75s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:25:47.353\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8aee3d55429950cd6afc01', 'answer': 'Clianthus', 'question': 'Which has more species, Clianthus or Callicoma?', 'supporting_facts': [['Clianthus', 0], ['Callicoma', 0]], 'context': [['Callicoma', ['Callicoma, is a plant genus that contains just one species, Callicoma serratifolia, a tall shrub or small tree which is native to Australia.', ' \"Callicoma serratifolia\" is commonly known as black wattle, derived from the similarity of the flowers to those of Australian \"Acacia\", which are commonly known as wattles.', ' The species has a number of other common names include callicoma, butterwood, silver leaf, silver-leaf butterwood and wild quince.']], ['Clianthus maximus', ['Clianthus maximus, commonly known as kaka beak (\"kōwhai ngutu-kākā\" in Māori), is a woody legume shrub native to New Zealand\\'s North Island.', ' It is one of two species of \"Clianthus\" (kaka beak) and both have striking clusters of red flowers which resemble the beak of the 
kākā, a New Zealand parrot.']], ['Aceria clianthi', ['Aceria clianthi is a species of mite belonging to the family Eriophyidae.', ' It is found only in New Zealand.', ' It is notable for being host specific to threatened plants of the genus \"Clianthus\".', ' It is classified by Buckley \"et al.\" as \"nationally critical\" under the New Zealand Threat Classification System.', ' They stated \"\"Aceria clianthi\" (Eriophyidae), has been recorded only from kakabeak (\"Clianthus\" spp.)', ' in cultivation and once on \"Lotus cornalatus\" [\"Lotus corniculatus\"] (Fabaceae), an introduced plant growing near kakabeak (Martin 2009).', ' It is given the same threat classification as kakabeak (de Lange et al. 2009).\"', ' Heenan had earlier stated that \"the two species [of \"Clianthus\"] are considered to be threatened, with \"C.\\xa0maximus\" having a rank of vulnerable, whereas \"C.\\xa0puniceus\" is critically endangered\", but the conservation status of \"C.\\xa0maximus\" was subsequently found to be more serious.', ' These threat classifications for \"Clianthus\" apply to plants in the wild, but the species are widely cultivated.', ' \"Aceria clianthi\" occurs on both plants in the wild and in cultivation.']], ['Clianthus', ['Clianthus, commonly known as kakabeak (\"Kōwhai ngutukākā\" in Māori), is a genus of flowering plants in the legume family Fabaceae, comprising two species of shrubs native to New Zealand.', ' They have striking clusters of red flowers which resemble the beak of the kākā, a New Zealand parrot.', \" The plants are also known as parrot's beak, parrot's bill and lobster claw - all references to the distinctive flowers.\", ' There is also a variety with white to creamy coloured flowers.']], ['Clianthus puniceus', ['Clianthus puniceus, common name kaka beak (\"Kōwhai Ngutu-kākā\" in Māori), is a species of flowering plant in the genus \"Clianthus\" of the legume family Fabaceae, native to New Zealand\\'s North Island.', ' It is an evergreen shrub, 
one of two species of \"Clianthus\", both of which have striking clusters of red flowers resembling the beak of the kākā, a New Zealand parrot.', \" The plant is also known as parrot's beak, parrot's bill and lobster claw.\", ' There is also a variety with white to creamy coloured flowers.']], ['Myrcianthes callicoma', ['Myrcianthes callicoma is a species of plant in the Myrtaceae family.', ' It is found in Argentina and Bolivia.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 96%|█████████▌| 481/500 [04:46<00:24, 1.31s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 12:25:47.666\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8afb9955429971feec45c2', 'answer': 'Disney California Adventure', 'question': 'The eighth-longest rollercoaster in the world is located in what theme park?', 'supporting_facts': [['Paradise Pier', 0], ['Paradise Pier', 2], [\"California Screamin'\", 1]], 'context': [['Theme Park Inc', ['Theme Park Inc. 
(also known as SimCoaster in the United States and Theme Park Manager in Australia) is a construction and management simulation video game.', ' It is the last game of the Theme Park series that started with \"Theme Park\" in 1994 and continued with \"Theme Park World\" in 1999.', ' \"Theme Park Inc.\" was developed by Bullfrog Productions and published by Electronic Arts.', \" It was the last game to bear the Bullfrog logo before the company's merger with EA UK in 2004.\"]], ['Castle Park (amusement park)', ['Castle Park, formerly Castle Amusement Park, is a 25-acre amusement park and family amusement center located in Riverside, California.', ' The park utilizes a medieval \"castle\" theme and includes attractions such as a miniature golf course, arcade, and 27 amusement rides including three roller coasters such as \"Merlin\\'s Revenge\", a junior rollercoaster, \"Screamin\\' Demon\" a spinning Wild Mouse rollercoaster, and \"Little Dipper\", a children\\'s rollercoaster.', ' The main \"castle\" themed building, houses the arcade as well as its only dark ride; \"Ghost Blasters\", an interactive attraction, designed by Sally Corporation, which can also be found at other amusement parks throughout North America.', \" The park was designed, built and operated by Bud Hurlbut, who designed several rides at Knott's Berry Farm.\", ' Castle Park is currently owned and operated by Palace Entertainment.']], [\"California Screamin'\", ['The boardwalk-themed launched roller coaster was designed by Ingenieur Büro Stengel GmbH and was built by Intamin.', \" California Screamin' is the eighth-longest rollercoaster in the world (and third-longest steel coaster in the United States), at 6072 ft long.\", ' It is also the longest ride with an inversion (since Son of Beast became defunct).', ' Its highest point is 120 ft followed by a 108 ft drop.', ' Though built of steel, the structure as designed visually mimics the features of a wooden coaster.']], ['Tokyo DisneySea', ['Tokyo 
DisneySea (東京ディズニーシー , Tōkyō DizunīShī ) is a 176 acre theme park at the Tokyo Disney Resort located in Urayasu, Chiba, Japan, just outside Tokyo.', ' It opened on 4 September 2001, at a cost of 335 billion yen.', ' Owned by The Oriental Land Company, which licenses Disney characters and themes from The Walt Disney Company, Tokyo DisneySea attracted an estimated 11 million visitors in 2016, making it the sixth-most-visited theme park in the world.', ' Tokyo DisneySea was the second theme park to open at the Tokyo Disney Resort and the ninth park of the twelve worldwide Disney theme parks to open.', ' Tokyo DisneySea was the fastest theme park in the world to reach the milestone of 10 million guests, having done so in 307 days after its grand opening.', ' The previous record-holder was Universal Studios Japan 338 days after its opening.']], ['RollerCoaster Tycoon World', ['RollerCoaster Tycoon World is a theme park construction and management simulation video game developed by Nvizzio Creations and published by Atari for Microsoft Windows.', ' It is the fourth major installment in the \"RollerCoaster Tycoon\" series.', ' The game was released on November 16, 2016.']], ['Paradise Pier', ['Paradise Pier is a themed land at Disney California Adventure, based on that of Victorian boardwalks that were once found along the coast of California.', ' Despite its name and the presence of a nearby man-made lake, Paradise Pier is not actually a pier, but merely a waterside area of the park.', \" The roller coaster California Screamin' sprawls across much of it, with various other attractions and forms of entertainment scattered around it.\"]], ['Pittsburgh Phantoms (RHI)', ['The Pittsburgh Phantoms were a professional roller hockey team based in Pittsburgh, Pennsylvania, United States that played in Roller Hockey International.', ' The team got its name from the \"Steel Phantom\" rollercoaster, located at Kennywood Park, a theme park located in the suburb of West Mifflin, 
Pennsylvania.', \" At the time of the team's inception the Phantom was the tallest and fastest steel rollercoaster in the world.\", \" The logo was heavily inspired by the roller coaster's logo seen at the entrance to the ride.\"]], [\"Gulliver's Kingdom\", [\"Gulliver's Kingdom (also known as Gulliver's Matlock Bath) is a theme park aimed at children aged 3 to 13 in the Derbyshire town of Matlock Bath, England.\", \" Founded in 1978 by Ray Phillips, it is still owned by the Phillips family and now has sister theme parks; Gulliver's World in Warrington and Gulliver's Land in Milton Keynes, which opened in 1989 and 1999 respectively.\", \" The park was originally created by Ray Phillips as a model village for his young children to enjoy, and it is close to the site of the Victorian Switchback rollercoaster ride (after which the theme park's main rollercoaster was named).\"]], ['Paultons Park', ['Paultons Family Theme Park | Home of Peppa Pig World is located in the village of Ower, near Romsey, in Hampshire, England.', ' The theme park has 70 rides and attractions.', ' The Peppa Pig World theme park area is based on the children’s television series character.', ' The Lost Kingdom theme park area includes 27 animatronic dinosaurs.', ' The park name is derived from the former Paultons Estate, on which the park is situated.', ' The park covers 140 acres of land and features a collection of around 80 species of birds and animals, in addition to the rides.', ' Most of the theme park rides are designed for children, which is why the park considers itself a family theme park.']], ['Theme Park World', ['Theme Park World, also known as Theme Park 2, and in North America as Sim Theme Park, is a 1999 construction and management simulation game developed by Bullfrog Productions and released by Electronic Arts.', ' The direct sequel to \"Theme Park\" (\"Theme Hospital\" and \"Theme Aquarium\" are thematic sequels), the player constructs and manages an amusement park with the aim 
of making profit and keeping visitors happy.', ' Initially developed for Windows, it was ported to PlayStation and PlayStation 2 (whose version was titled Theme Park Roller Coaster in North America), as well for Macintosh computers.', ' The Mac version was published by Feral Interactive.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-07 12:25:47.796\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 97%|█████████▋| 483/500 [04:47<00:16, 1.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:47.811\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:47.821\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:47.845\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:47.852\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:47.880\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 489/500 [04:53<00:11, 1.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:54.687\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7fb5ce5542994857a767ca', 'answer': 'Robert \"Bumps\" Blackwell', 'question': \"Here's Little Richard is a debut album containing a song written by Little Richard, Enotris Johnson and who else?\", 'supporting_facts': [[\"Here's Little Richard\", 0], [\"Here's Little Richard\", 3], ['Long Tall Sally', 0]], 'context': [['Elvis Is Dead', ['\"Elvis Is Dead\" is a song by Living Colour featuring Little Richard and Maceo Parker off the album \"Time\\'s Up\".', ' Before, during, and after Little Richard\\'s guest rap performance, many voices speak the song title, concluded by 
one announcing, \"Elvis has left the building!\"', ' After, the band twisted the line \"Maybe I\\'ve a reason to believe we all will be received in Graceland\" from Paul Simon\\'s \"Graceland\" to yield the refrain, \"I\\'ve got a reason to believe we all won\\'t be received at Graceland.\"', ' They also quote Public Enemy\\'s \"Fight the Power\" in stating, \"Elvis was a hero to most,\" but diverge in adding, \"But that\\'s beside the point.\"']], ['The King of Rock and Roll', ['The King of Rock and Roll is Little Richard\\'s second album for Reprise Records, a follow-up album that contained one original Little Richard song, the gospel rock \"In the Name\" and a new song co-written by Producer H. B. Barnum, \"Green Power\", the single release; and versions of tracks by artists as diverse as Hank Williams, The Temptations, Martha and the Vandellas, Three Dog Night, and The Rolling Stones.', \" The title track, a mock braggadocio that referenced Tom Jones, Elvis Presley, Ike & Tina Turner, Sly and the Family Stone, and Aretha Franklin, amongst others, upset some fans, although the album's title tune got good airplay in New York - a 1950s style jump blues, with an exceptional Little Richard shouting vocal!\", ' But fans and critics were further upset that the album did not feature acoustic piano and that most tracks were badly mixed, with an intrusive girl group chorus.']], ['Rip It Up (Little Richard song)', ['\"Rip It Up\" is a song written by Robert Blackwell and John Marascalco.', ' It was first released by Little Richard in June, 1956.', ' Bill Haley and his Comets also released a recording of the song that year.', ' The Little Richard version hit number one on the R&B Best Sellers chart for two weeks and peaked at number 17 on the pop chart.', ' The Bill Haley and the Comets recording reached number twenty five on the \"Billboard\" pop singles chart and number four in the UK.', ' Bill Haley and the Comets also performed their version of the song in the 1956 
film \"Don\\'t Knock the Rock\", in which Little Richard also appeared.']], [\"Here's Little Richard\", [\"Here's Little Richard is the debut album from Little Richard, released on March 1957.\", ' He had scored six Top 40 hits the previous year, some of which were included on this recording.', ' It was his highest charting album, at 13 on the \"Billboard\" Pop Albums chart.', ' The album contained two of Richard\\'s biggest hits, \"Long Tall Sally\", which reached #6 in the U.S. Pop charts, and \"Jenny, Jenny\", which reached #10 in the U.S. Pop charts.']], ['Jenny, Jenny', ['\"Jenny, Jenny\" is a 1957 song written by American musician Little Richard and Enotris Johnson and recorded and released by Little Richard.', ' It was featured on Penniman\\'s debut album, \"Here\\'s Little Richard\" and peaked at number ten on the US \"Billboard\" Hot 100 and reached number two on the Hot Rhythm and Blues Singles chart.']], ['Cliff (album)', ['Cliff Richard\\'s, debut album \"Cliff\" was released in April 1959 and reached No. 
4 in the UK album chart.', ' A rock album, it was recorded live at Abbey Road Studios in February 1959 with The Shadows, then known as The Drifters, in front of an invited audience of 200 to 300 fans.', ' It features live recordings of Cliff\\'s own hit single \"Move It\" and both sides of the yet to be released Drifters\\' instrumental single \"Jet Black\"/\"Driftin\\'\" as well as a number of rock \\'n\\' roll standards, particularly of Elvis Presley songs, others include, Buddy Holly, Little Richard, Jerry Lee Lewis, Roy Orbison and Gene Vincent']], ['Ultimate Hits: Rock and Roll Never Forgets', ['Ultimate Hits: Rock and Roll Never Forgets is a compilation album by American rock singer–songwriter Bob Seger.', \" The double-disc album was released on November 21, 2011 and contains 26 remastered tracks from throughout Seger's career, which spans more than four decades.\", ' Included are the original mono version of \"Ramblin\\' Gamblin\\' Man\", Seger\\'s first hit with The Bob Seger System from 1968, the classic Christmas song \"The Little Drummer Boy\" from 1987\\'s \"A Very Special Christmas\", which makes its first appearance on a Seger album, and previously unreleased cover versions of Tom Waits\\' \"Downtown Train\" and Little Richard\\'s \"Hey, Hey, Hey, Hey (Going Back to Birmingham).\"', ' There is also a Walmart exclusive edition that includes the bonus track \"Living Inside My Heart,\" a song from the soundtrack of the 1986 film \"About Last Night...\", which has also never before been released on any Bob Seger album.', ' Two songs on this compilation album are edited compared to the original releases: \"We\\'ve Got Tonight\" is the single edit, which is about one minute shorter than the album version, and \"Katmandu\" is a newly edited version which omits the second verse, making the song also about one minute shorter compared to the original album version.', ' In the US it was certified gold and platinum in June 2013 by the RIAA.']], ['Long Tall 
Sally', ['\"Long Tall Sally\" is a rock and roll 12-bar blues song written by Robert \"Bumps\" Blackwell, Enotris Johnson, and Little Richard; recorded by Little Richard; and released in March 1956 on the Specialty Records label.']], ['Clumsy (Fergie song)', ['\"Clumsy\" is a song recorded by American singer and rapper Fergie for her debut studio album, \"The Dutchess\" (2006).', \" The song was released as the album's fifth single on September 25, 2007.\", ' It was written by Fergie, Bobby Troup and will.i.am, who also produced the track.', ' It was partially recorded in Los Angeles and in the John Lennon Educational Tour Bus.', ' \"Clumsy\" is a pop, bubblegum pop and R&B song.', ' The song\\'s lyrics about being clumsy and in love flow alongside its computerized and bleeping beat taken from \"The Bubble Bunch\" by Jimmy Spicer, as well as a sample of \"The Girl Can\\'t Help It\", originally performed by Little Richard.']], ['Tutti Frutti (song)', ['\"Tutti Frutti\" (meaning \"all fruits\" in Italian) is a song written by Little Richard along with Dorothy LaBostrie that was recorded in 1955 and became his first major hit record.', ' With its opening cry of \"A-wop-bop-a-loo-bop-a-wop-bam-boom!\"', ' (a verbal rendition of a drum pattern that Little Richard had imagined) and its hard-driving sound and wild lyrics, it became not only a model for many future Little Richard songs, but also a model for rock and roll itself.', \" The song introduced several of rock music's most characteristic musical features, including its loud volume and vocal style emphasizing power, and its distinctive beat and rhythm.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 98%|█████████▊| 491/500 [04:55<00:09, 1.02s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 492/500 [04:56<00:08, 1.08s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 494/500 [04:57<00:04, 1.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 100%|█████████▉| 498/500 [04:57<00:00, 2.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 100%|██████████| 500/500 [04:57<00:00, 1.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-07 12:25:58.445\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m13\u001b[0m - \u001b[1mEvaluation metrics (after optimization): {'f1': 0.5265256907907312, 'em': 0.3899782135076253, 'acc': 
# Run TextGrad optimisation on the workflow, then score the resulting graph on
# the benchmark's test split and log the aggregate metrics.
# NOTE(review): `logger`, `suppress_logger_info`, `textgrad_optimizer` and
# `benchmark` are created in earlier cells; on a fresh kernel those cells must
# be executed first.
logger.info("Optimizing workflow...")
# seed=8 is forwarded to the optimizer; presumably it fixes the sampling order
# of training batches -- TODO confirm against TextGradOptimizer.optimize.
textgrad_optimizer.optimize(benchmark, seed=8)
# Presumably reinstates the best-scoring graph seen during optimisation
# (inferred from the method name only) -- TODO confirm.
textgrad_optimizer.restore_best_graph()

logger.info("Evaluating workflow on test set...")
# NOTE(review): the saved output of this cell contains Azure RateLimitError
# entries alongside all-zero metric lines. Verify how errored examples are
# scored before trusting the aggregate; a re-run with fewer concurrent workers
# or retry/back-off may be needed.
with suppress_logger_info():
    results = textgrad_optimizer.evaluate(dataset=benchmark, eval_mode="test")
logger.info(f"Evaluation metrics (after optimization): {results}")
def collate_func(example: dict) -> dict:
    """Turn one benchmark record into the inputs of the ``generate_answer`` task.

    Args:
        example: A record providing ``example["context"]["contexts"]`` (an
            iterable of paragraph strings) and ``example["question"]``.

    Returns:
        A dict with exactly two keys:
        ``"question"`` -- the combined ``Context: ... Question: ... Answer:``
        prompt (unchanged from the previous implementation), and
        ``"context"`` -- the paragraphs joined with newlines.

    Raises:
        KeyError: if ``context``, ``contexts`` or ``question`` is missing.
        TypeError: if the paragraphs are not strings.
    """
    paragraphs = example["context"]["contexts"]
    context = "\n".join(paragraphs)
    problem = "Context: {}\n\nQuestion: {}\n\nAnswer:".format(context, example["question"])
    # Bug fix: the previous version returned the literal string 'context' here,
    # so the workflow's required `context` input never carried the passages.
    # NOTE(review): "question" still embeds the context (kept so the existing
    # prompt text is unchanged), which means the passages now appear in both
    # fields; pass the bare question instead if prompt size becomes a concern.
    return {"question": problem, "context": context}
# --- Workflow definition -----------------------------------------------------
# Specification of a one-task sequential workflow; it is handed to
# SequentialWorkFlowGraph.from_dict(...) further down.
# NOTE(review): the variable keeps a HotPotQA-era name although the benchmark
# instantiated below is PubMedQASplits.
hotpotqa_graph_data = dict(
    goal="Provide a concise answer to the question using relevant context. The answer must be straightforward and avoid unnecessary explanations.",
    tasks=[
        dict(
            name="generate_answer",
            description="Extract and formulate an answer from the given context.",
            inputs=[
                dict(
                    name="question",
                    type="str",
                    required=True,
                    description="The question that needs to be answered.",
                ),
                dict(
                    name="context",
                    type="str",
                    required=True,
                    description="The background information pertinent to the question.",
                ),
            ],
            outputs=[
                dict(
                    name="answer",
                    type="str",
                    required=True,
                    description="The direct answer to the question.",
                ),
            ],
            prompt_template=StringTemplate(
                instruction="Use the context to determine the best answer to the question. Provide your final answer in a clear format, without extra commentary or reasoning."
            ),
            parse_mode="xml",
        ),
    ],
)

# --- Benchmark, agents, evaluator and optimizer wiring -----------------------
# NOTE(review): `PubMedQASplits`, `executor_llm` and `optimizer_llm` are not
# defined in the cells visible in this section; they must already exist in the
# kernel from earlier cells.
# NOTE(review): the stderr saved with this cell logs "loading HotPotQA data",
# which suggests the stored outputs predate the switch to PubMedQASplits --
# re-run to refresh them.
benchmark = PubMedQASplits()
workflow_graph = SequentialWorkFlowGraph.from_dict(hotpotqa_graph_data)

agent_manager = AgentManager()
agent_manager.add_agents_from_workflow(workflow_graph, executor_llm.config)

# NOTE(review): saved outputs elsewhere in this notebook show Azure token
# rate-limit errors during evaluation; 20 concurrent workers may contribute --
# TODO confirm and tune.
evaluator = Evaluator(
    llm=executor_llm,
    agent_manager=agent_manager,
    collate_func=collate_func,
    num_workers=20,
    verbose=True,
)

textgrad_optimizer = TextGradOptimizer(
    graph=workflow_graph,
    optimize_mode="all",
    executor_llm=executor_llm,
    optimizer_llm=optimizer_llm,
    batch_size=3,
    max_steps=20,
    evaluator=evaluator,
    eval_every_n_steps=1,
    eval_rounds=1,
    save_interval=None,
    save_path="./",
    rollback=True,
    constraints=[],
)
{ "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 2%|▏ | 10/500 [00:02<01:07, 7.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 2%|▏ | 12/500 [00:02<00:54, 8.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 3%|▎ | 16/500 [00:02<00:39, 12.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 4%|▍ | 19/500 [00:02<00:43, 11.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 5%|▍ | 23/500 [00:03<00:52, 9.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 5%|▌ | 25/500 [00:03<01:02, 7.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics 
{'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▌ | 29/500 [00:04<00:41, 11.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▋ | 32/500 [00:04<00:38, 12.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 7%|▋ | 34/500 [00:04<00:50, 9.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4827586206896552, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 7%|▋ | 37/500 [00:04<00:49, 9.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 39/500 [00:05<00:50, 9.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 9%|▊ | 43/500 [00:05<00:46, 9.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 
'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▉ | 45/500 [00:05<00:47, 9.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|▉ | 48/500 [00:05<00:38, 11.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|█ | 50/500 [00:06<00:53, 8.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|█ | 52/500 [00:06<00:49, 9.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 11%|█▏ | 57/500 [00:06<00:40, 10.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 59/500 [00:07<00:43, 10.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.08695652173913045, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 61/500 [00:07<00:49, 8.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 13%|█▎ | 63/500 [00:07<00:49, 8.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 13%|█▎ | 65/500 [00:07<00:50, 8.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 14%|█▎ | 68/500 [00:08<00:45, 9.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 14%|█▍ | 72/500 [00:08<00:45, 9.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.30769230769230765, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 16%|█▌ | 78/500 [00:08<00:24, 17.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 
'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 16%|█▌ | 81/500 [01:00<34:30, 4.94s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 17%|█▋ | 85/500 [01:01<20:14, 2.93s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 18%|█▊ | 91/500 [01:01<09:20, 1.37s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.1111111111111111, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 94/500 [01:01<06:30, 1.04it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.10526315789473684, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 97/500 [01:01<04:32, 1.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", 
"Evaluating workflow: 20%|██ | 100/500 [01:02<03:50, 1.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 102/500 [01:03<03:16, 2.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 22%|██▏ | 108/500 [01:03<01:39, 3.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 110/500 [01:03<01:26, 4.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4615384615384615, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 112/500 [01:03<01:12, 5.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 114/500 [01:04<01:09, 5.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating 
workflow: 23%|██▎ | 116/500 [01:04<01:03, 6.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4799999999999999, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 24%|██▍ | 120/500 [01:04<00:45, 8.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▍ | 123/500 [01:05<00:39, 9.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 25%|██▌ | 127/500 [01:05<00:44, 8.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 26%|██▌ | 130/500 [01:06<00:47, 7.72it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 26%|██▋ | 132/500 [01:06<00:38, 9.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 
'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 27%|██▋ | 136/500 [01:06<00:36, 10.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5185185185185185, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 28%|██▊ | 141/500 [01:06<00:22, 15.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▊ | 143/500 [01:07<00:30, 11.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▉ | 145/500 [01:07<00:31, 11.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444444, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▉ | 147/500 [01:07<00:33, 10.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|██▉ | 149/500 [01:08<00:47, 7.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.21052631578947367, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 
1.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 31%|███ | 154/500 [01:08<00:38, 9.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 31%|███ | 156/500 [01:08<00:35, 9.70it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 32%|███▏ | 161/500 [01:08<00:27, 12.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.42857142857142855, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 33%|███▎ | 164/500 [01:09<00:31, 10.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 33%|███▎ | 166/500 [01:09<00:35, 9.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 34%|███▍ | 169/500 [01:10<00:39, 8.35it/s]" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 34%|███▍ | 171/500 [01:10<00:49, 6.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▍ | 172/500 [01:10<00:50, 6.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7142857142857143, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▍ | 173/500 [02:01<1:08:26, 12.56s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 36%|███▌ | 179/500 [02:01<17:33, 3.28s/it] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.15384615384615383, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 37%|███▋ | 184/500 [02:01<08:09, 1.55s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 37%|███▋ | 187/500 [02:01<05:24, 1.04s/it]" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.047619047619047616, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 38%|███▊ | 189/500 [02:02<04:37, 1.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 38%|███▊ | 191/500 [02:03<03:35, 1.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 39%|███▉ | 196/500 [02:03<01:50, 2.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|███▉ | 199/500 [02:03<01:19, 3.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 41%|████ | 203/500 [02:04<00:59, 4.99it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.09090909090909091, 'em': 0.0, 'acc': 0.0}\n" ] 
}, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 42%|████▏ | 208/500 [02:04<00:38, 7.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 42%|████▏ | 210/500 [02:05<00:41, 6.96it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 43%|████▎ | 215/500 [02:05<00:26, 10.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.7777777777777778, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 43%|████▎ | 217/500 [02:05<00:24, 11.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 219/500 [02:05<00:27, 10.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 221/500 [02:06<00:34, 8.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics 
{'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 45%|████▌ | 227/500 [02:06<00:25, 10.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 229/500 [02:06<00:26, 10.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.21428571428571425, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.15384615384615385, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 231/500 [02:07<00:31, 8.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 47%|████▋ | 233/500 [02:07<00:29, 9.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 48%|████▊ | 239/500 [02:07<00:22, 11.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.25, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3636363636363636, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { 
"name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 49%|████▉ | 244/500 [02:08<00:24, 10.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 49%|████▉ | 246/500 [02:08<00:25, 9.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|████▉ | 248/500 [02:08<00:26, 9.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|█████ | 251/500 [02:09<00:30, 8.25it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 51%|█████ | 253/500 [02:09<00:29, 8.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 51%|█████▏ | 257/500 [02:09<00:26, 9.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 
1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 52%|█████▏ | 261/500 [02:10<00:19, 12.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 263/500 [02:10<00:17, 13.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 265/500 [02:10<00:28, 8.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 267/500 [02:11<00:46, 5.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.061224489795918366, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 54%|█████▍ | 269/500 [03:01<25:32, 6.64s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 54%|█████▍ | 271/500 [03:01<16:36, 4.35s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "Evaluating workflow: 56%|█████▌ | 280/500 [03:01<04:16, 1.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5625, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.11764705882352941, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 283/500 [03:02<03:03, 1.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 286/500 [03:03<02:32, 1.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 58%|█████▊ | 290/500 [03:03<01:37, 2.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 59%|█████▉ | 295/500 [03:03<00:51, 3.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2222222222222222, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 59%|█████▉ | 297/500 [03:04<00:46, 4.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 60%|██████ | 301/500 [03:04<00:35, 5.65it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 61%|██████ | 304/500 [03:05<00:31, 6.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 61%|██████ | 306/500 [03:05<00:30, 6.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 308/500 [03:05<00:23, 8.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 310/500 [03:05<00:24, 7.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.3571428571428571, 
'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 63%|██████▎ | 315/500 [03:06<00:18, 10.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 64%|██████▍ | 319/500 [03:06<00:23, 7.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 322/500 [03:07<00:17, 10.44it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 65%|██████▌ | 327/500 [03:07<00:12, 13.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 66%|██████▌ | 329/500 [03:07<00:11, 14.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 
66%|██████▌ | 331/500 [03:07<00:16, 10.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 67%|██████▋ | 335/500 [03:08<00:20, 8.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 67%|██████▋ | 337/500 [03:08<00:18, 8.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 339/500 [03:08<00:18, 8.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.27118644067796605, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 69%|██████▊ | 343/500 [03:09<00:15, 9.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3157894736842105, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 69%|██████▉ | 347/500 [03:09<00:18, 8.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 
0.14285714285714285, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 70%|███████ | 352/500 [03:10<00:14, 10.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.048780487804878044, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 71%|███████ | 354/500 [03:10<00:16, 8.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 358/500 [03:10<00:14, 10.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 72%|███████▏ | 360/500 [03:11<00:15, 9.09it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.21052631578947367, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 361/500 [04:01<19:51, 8.57s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 73%|███████▎ | 365/500 [04:01<08:58, 3.99s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 
0.0, 'acc': 1.0}\n", "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 74%|███████▍ | 371/500 [04:02<03:12, 1.50s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.05263157894736842, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▍ | 373/500 [04:02<02:28, 1.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▌ | 375/500 [04:02<01:55, 1.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6363636363636364, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 76%|███████▌ | 378/500 [04:03<01:14, 1.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 379/500 [04:03<01:03, 1.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 77%|███████▋ | 384/500 [04:03<00:26, 4.31it/s]" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 77%|███████▋ | 386/500 [04:03<00:20, 5.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 389/500 [04:04<00:18, 6.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4347826086956522, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 391/500 [04:05<00:22, 4.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 79%|███████▉ | 394/500 [04:05<00:19, 5.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 80%|███████▉ | 398/500 [04:05<00:12, 8.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "\r", "Evaluating workflow: 80%|████████ | 400/500 [04:05<00:10, 9.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 80%|████████ | 402/500 [04:06<00:11, 8.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 404/500 [04:06<00:09, 9.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.24000000000000002, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 406/500 [04:06<00:09, 9.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 82%|████████▏ | 409/500 [04:06<00:11, 7.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 83%|████████▎ | 413/500 [04:07<00:08, 9.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.09523809523809525, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { 
"name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 84%|████████▍ | 420/500 [04:07<00:05, 14.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 422/500 [04:08<00:07, 11.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▍ | 424/500 [04:08<00:08, 8.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▌ | 426/500 [04:08<00:09, 7.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 86%|████████▌ | 431/500 [04:09<00:07, 9.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2666666666666667, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 433/500 [04:09<00:07, 
9.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 436/500 [04:09<00:05, 10.76it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4210526315789474, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 88%|████████▊ | 440/500 [04:10<00:05, 10.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 88%|████████▊ | 442/500 [04:10<00:05, 10.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 444/500 [04:10<00:06, 8.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 446/500 [04:10<00:05, 9.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8235294117647058, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 90%|█████████ | 451/500 [04:11<00:04, 
11.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 91%|█████████ | 455/500 [05:01<03:45, 5.02s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████▏| 457/500 [05:01<02:35, 3.61s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 92%|█████████▏| 462/500 [05:01<01:08, 1.80s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5263157894736842, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 93%|█████████▎| 466/500 [05:02<00:38, 1.13s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 94%|█████████▍| 471/500 [05:03<00:16, 1.71it/s]" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▍| 473/500 [05:03<00:12, 2.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.47619047619047616, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▌| 475/500 [05:03<00:10, 2.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▌| 477/500 [05:03<00:07, 3.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 96%|█████████▌| 479/500 [05:04<00:05, 3.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.06060606060606061, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 96%|█████████▋| 482/500 [05:04<00:03, 4.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 97%|█████████▋| 483/500 [05:05<00:04, 3.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 
'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 97%|█████████▋| 486/500 [05:05<00:02, 5.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 488/500 [05:05<00:01, 6.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.375, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 490/500 [05:05<00:01, 8.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 494/500 [05:06<00:00, 10.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 496/500 [05:06<00:00, 10.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 498/500 [05:07<00:00, 5.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": 
"stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 499/500 [05:07<00:00, 3.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 100%|██████████| 500/500 [05:08<00:00, 1.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.1, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 19:35:11.871\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mEvaluation metrics (before optimization): {'f1': 0.6032480280405156, 'em': 0.428, 'acc': 0.632}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "logger.info(\"Evaluating workflow on test set...\")\n", "with suppress_logger_info():\n", " results = textgrad_optimizer.evaluate(dataset=benchmark, eval_mode=\"test\")\n", "logger.info(f\"Evaluation metrics (before optimization): {results}\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "0f33f493", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'f1': 0.6032480280405156, 'em': 0.428, 'acc': 0.632}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results" ] }, { "cell_type": "markdown", "id": "ad79982b", "metadata": {}, "source": [ "# textgrad 4o generated pipeline" ] }, { "cell_type": "code", "execution_count": 2, "id": "a67c5a76", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. 
Please move to the pypdf library instead.\n", " warnings.warn(\n" ] } ], "source": [ "import os\n", "\n", "from dotenv import load_dotenv\n", "\n", "from evoagentx.agents.agent_manager import AgentManager\n", "from evoagentx.benchmark import HotPotQA\n", "from evoagentx.core.callbacks import suppress_logger_info\n", "from evoagentx.core.logging import logger\n", "from evoagentx.evaluators import Evaluator\n", "from evoagentx.models import OpenAILLM, OpenAILLMConfig\n", "from evoagentx.optimizers import TextGradOptimizer\n", "from evoagentx.prompts import StringTemplate\n", "from evoagentx.workflow import SequentialWorkFlowGraph\n", "from dotenv import load_dotenv\n", "\n", "from evoagentx.agents.agent_manager import AgentManager\n", "from evoagentx.benchmark import MBPP\n", "from evoagentx.core.callbacks import suppress_logger_info\n", "from evoagentx.core.logging import logger\n", "from evoagentx.evaluators import Evaluator\n", "from evoagentx.models import OpenAILLM, OpenAILLMConfig\n", "from evoagentx.optimizers import TextGradOptimizer\n", "from evoagentx.prompts import StringTemplate\n", "from evoagentx.workflow import SequentialWorkFlowGraph\n", "\n", "from evoagentx.models import OpenAILLMConfig, OpenAILLM\n", "from evoagentx.workflow import SEWWorkFlowGraph, STRUCTUREWorkFlowGraph\n", "from evoagentx.agents import AgentManager\n", "from evoagentx.benchmark import HumanEval,AFlowMBPP\n", "from evoagentx.evaluators import Evaluator \n", "from evoagentx.optimizers import SEWOptimizer, STRUCTUREOptimizer\n", "from evoagentx.optimizers.structure_optimizer import STRUCTUREWorkFlowScheme\n", "from evoagentx.core.callbacks import suppress_logger_info\n", "\n", "from evoagentx.models import OpenAILLMConfig, OpenAILLM,AzureOpenAIConfig,LiteLLMConfig,LiteLLM\n", "from evoagentx.workflow import SEWWorkFlowGraph \n", "from evoagentx.agents import AgentManager\n", "from evoagentx.benchmark import MBPPPLUS, AFlowMBPPPLUS\n", "from evoagentx.evaluators import Evaluator 
\n",
    "from evoagentx.optimizers import SEWOptimizer \n",
    "from evoagentx.core.callbacks import suppress_logger_info\n",
    "from evoagentx.benchmark import HumanEvalPLUS\n",
    "from evoagentx.benchmark import SciCode\n",
    "from copy import deepcopy\n",
    "\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13cee705",
   "metadata": {},
   "outputs": [],
   "source": [
    "def collate_func(example: dict) -> dict:\n",
    "    \"\"\"Turn one benchmark example into the workflow's `problem` input.\n",
    "\n",
    "    Args:\n",
    "        example: A record providing example[\"context\"][\"contexts\"] (joined with\n",
    "            newlines, so expected to be an iterable of strings) and\n",
    "            example[\"question\"].\n",
    "\n",
    "    Returns:\n",
    "        A dict with the single key \"problem\" holding the formatted\n",
    "        \"Context: ... Question: ... Answer:\" prompt text.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If \"context\", \"contexts\" or \"question\" is missing.\n",
    "    \"\"\"\n",
    "    context = \"\\n\".join(example[\"context\"][\"contexts\"])\n",
    "    problem = \"Context: {}\\n\\nQuestion: {}\\n\\nAnswer:\".format(context, example[\"question\"])\n",
    "    return {\"problem\": problem}\n",
    "\n",
    "\n",
    "# Single-task workflow definition consumed by SequentialWorkFlowGraph.from_dict.\n",
    "# The instruction spells out the literal <thought>/<answer> tags because the task\n",
    "# uses parse_mode \"xml\"; presumably the `answer` output is extracted from the\n",
    "# matching tag -- verify against the EvoAgentX parser.\n",
    "hotpotqa_graph_data = {\n",
    "    \"goal\": \"Answer the question based on the context. The answer should be a direct response to the question, without including explanations or reasoning.\",\n",
    "    \"tasks\": [\n",
    "        {\n",
    "            \"name\": \"answer_generate\",\n",
    "            \"description\": \"Answer the question based on the context.\",\n",
    "            \"inputs\": [\n",
    "                {\"name\": \"problem\", \"type\": \"str\", \"required\": True, \"description\": \"The problem to solve.\"}\n",
    "            ],\n",
    "            \"outputs\": [\n",
    "                {\"name\": \"answer\", \"type\": \"str\", \"required\": True, \"description\": \"The answer to the problem.\"}\n",
    "            ],\n",
    "            \"prompt_template\": StringTemplate(instruction=\"Think step by step to answer the question. You should explain your thinking process in the 'thought' field, and provide the final answer in the 'answer' field.\\nFormat your output in xml format, such as <thought>xxx</thought> and <answer>xxx</answer>.\"),\n",
    "            \"parse_mode\": \"xml\"\n",
    "        }\n",
    "    ]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4a4ff07a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2026-01-14 22:06:18.976\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.hotpotqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1mloading HotPotQA data from /gpfs/radev/home/tl688/.evoagentx/data/hotpotqa/hotpot_train_v1.1.json ...\u001b[0m\n",
      "\u001b[32m2026-01-14 22:06:22.957\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.hotpotqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1mloading HotPotQA data from /gpfs/radev/home/tl688/.evoagentx/data/hotpotqa/hotpot_dev_distractor_v1.json ...\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): PubMedQASplits, executor_llm and optimizer_llm are used below but are\n",
    "# not defined in this cell; confirm the cells that define them run before this one.\n",
    "# NOTE(review): the saved output of this cell logs HotPotQA files being loaded, which\n",
    "# does not match PubMedQASplits -- it looks stale; re-run the cell to refresh it.\n",
    "benchmark = PubMedQASplits()\n",
    "workflow_graph = SequentialWorkFlowGraph.from_dict(hotpotqa_graph_data)\n",
    "\n",
    "# Register the agents described by the workflow graph, using the executor LLM's config.\n",
    "agent_manager = AgentManager()\n",
    "agent_manager.add_agents_from_workflow(workflow_graph, executor_llm.config)\n",
    "\n",
    "# The evaluator feeds each benchmark example through collate_func before running it.\n",
    "evaluator = Evaluator(\n",
    "    llm=executor_llm,\n",
    "    agent_manager=agent_manager,\n",
    "    collate_func=collate_func,\n",
    "    num_workers=20,\n",
    "    verbose=True\n",
    ")\n",
    "\n",
    "textgrad_optimizer = TextGradOptimizer(\n",
    "    graph=workflow_graph,\n",
    "    optimize_mode=\"all\",\n",
    "    executor_llm=executor_llm,\n",
    "    optimizer_llm=optimizer_llm,\n",
    "    batch_size=3,\n",
    "    max_steps=20,\n",
    "    evaluator=evaluator,\n",
    "    eval_every_n_steps=1,\n",
    "    eval_rounds=1,\n",
    "    save_interval=None,\n",
    "    save_path=\"./\",\n",
    "    rollback=True,\n",
    "    constraints=[]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c53d6512",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7405"
      ]
     },
     "execution_count":
8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(benchmark._fulldata)" ] }, { "cell_type": "code", "execution_count": 13, "id": "29a6330e", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-07 19:30:03.447\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m1\u001b[0m - \u001b[1mEvaluating workflow on test set...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 0%| | 1/500 [00:01<12:47, 1.54s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 1%| | 6/500 [00:01<01:41, 4.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 2%|▏ | 10/500 [00:02<01:07, 7.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 2%|▏ | 12/500 [00:02<00:54, 8.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating 
workflow: 3%|▎ | 16/500 [00:02<00:39, 12.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 4%|▍ | 19/500 [00:02<00:43, 11.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 5%|▍ | 23/500 [00:03<00:52, 9.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 5%|▌ | 25/500 [00:03<01:02, 7.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▌ | 29/500 [00:04<00:41, 11.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▋ | 32/500 [00:04<00:38, 12.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 7%|▋ 
| 34/500 [00:04<00:50, 9.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4827586206896552, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 7%|▋ | 37/500 [00:04<00:49, 9.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 39/500 [00:05<00:50, 9.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 9%|▊ | 43/500 [00:05<00:46, 9.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▉ | 45/500 [00:05<00:47, 9.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|▉ | 48/500 [00:05<00:38, 11.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|█ | 50/500 [00:06<00:53, 8.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics 
{'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|█ | 52/500 [00:06<00:49, 9.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 11%|█▏ | 57/500 [00:06<00:40, 10.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 59/500 [00:07<00:43, 10.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.08695652173913045, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 61/500 [00:07<00:49, 8.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 13%|█▎ | 63/500 [00:07<00:49, 8.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 13%|█▎ | 65/500 [00:07<00:50, 8.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 
0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 14%|█▎ | 68/500 [00:08<00:45, 9.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 14%|█▍ | 72/500 [00:08<00:45, 9.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.30769230769230765, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 16%|█▌ | 78/500 [00:08<00:24, 17.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 16%|█▌ | 81/500 [01:00<34:30, 4.94s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 17%|█▋ | 85/500 [01:01<20:14, 2.93s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 18%|█▊ | 91/500 
[01:01<09:20, 1.37s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.1111111111111111, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 94/500 [01:01<06:30, 1.04it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.10526315789473684, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 97/500 [01:01<04:32, 1.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 100/500 [01:02<03:50, 1.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 102/500 [01:03<03:16, 2.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 22%|██▏ | 108/500 [01:03<01:39, 3.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 
0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 110/500 [01:03<01:26, 4.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4615384615384615, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 112/500 [01:03<01:12, 5.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 114/500 [01:04<01:09, 5.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 116/500 [01:04<01:03, 6.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4799999999999999, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 24%|██▍ | 120/500 [01:04<00:45, 8.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▍ | 123/500 [01:05<00:39, 9.49it/s]" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 25%|██▌ | 127/500 [01:05<00:44, 8.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 26%|██▌ | 130/500 [01:06<00:47, 7.72it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 26%|██▋ | 132/500 [01:06<00:38, 9.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 27%|██▋ | 136/500 [01:06<00:36, 10.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5185185185185185, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 28%|██▊ | 141/500 [01:06<00:22, 15.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▊ | 143/500 [01:07<00:30, 11.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▉ | 145/500 [01:07<00:31, 11.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444444, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▉ | 147/500 [01:07<00:33, 10.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|██▉ | 149/500 [01:08<00:47, 7.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.21052631578947367, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 31%|███ | 154/500 [01:08<00:38, 9.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 31%|███ | 156/500 [01:08<00:35, 9.70it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "Evaluating workflow: 32%|███▏ | 161/500 [01:08<00:27, 12.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.42857142857142855, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 33%|███▎ | 164/500 [01:09<00:31, 10.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 33%|███▎ | 166/500 [01:09<00:35, 9.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 34%|███▍ | 169/500 [01:10<00:39, 8.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 34%|███▍ | 171/500 [01:10<00:49, 6.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▍ | 172/500 [01:10<00:50, 6.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7142857142857143, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▍ | 173/500 
[02:01<1:08:26, 12.56s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 36%|███▌ | 179/500 [02:01<17:33, 3.28s/it] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.15384615384615383, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 37%|███▋ | 184/500 [02:01<08:09, 1.55s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 37%|███▋ | 187/500 [02:01<05:24, 1.04s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.047619047619047616, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 38%|███▊ | 189/500 [02:02<04:37, 1.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 38%|███▊ | 191/500 [02:03<03:35, 1.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", 
"metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 39%|███▉ | 196/500 [02:03<01:50, 2.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|███▉ | 199/500 [02:03<01:19, 3.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 41%|████ | 203/500 [02:04<00:59, 4.99it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.09090909090909091, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 42%|████▏ | 208/500 [02:04<00:38, 7.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 42%|████▏ | 210/500 [02:05<00:41, 6.96it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] 
}, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 43%|████▎ | 215/500 [02:05<00:26, 10.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.7777777777777778, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 43%|████▎ | 217/500 [02:05<00:24, 11.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 219/500 [02:05<00:27, 10.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 221/500 [02:06<00:34, 8.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 45%|████▌ | 227/500 [02:06<00:25, 10.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 229/500 [02:06<00:26, 10.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.21428571428571425, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.15384615384615385, 'em': 0.0, 'acc': 1.0}\n" ] 
}, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 231/500 [02:07<00:31, 8.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 47%|████▋ | 233/500 [02:07<00:29, 9.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 48%|████▊ | 239/500 [02:07<00:22, 11.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.25, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3636363636363636, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 49%|████▉ | 244/500 [02:08<00:24, 10.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 49%|████▉ | 246/500 [02:08<00:25, 9.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|████▉ 
| 248/500 [02:08<00:26, 9.60it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|█████ | 251/500 [02:09<00:30, 8.25it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 51%|█████ | 253/500 [02:09<00:29, 8.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 51%|█████▏ | 257/500 [02:09<00:26, 9.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 52%|█████▏ | 261/500 [02:10<00:19, 12.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 263/500 [02:10<00:17, 13.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 265/500 
[02:10<00:28, 8.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 267/500 [02:11<00:46, 5.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.061224489795918366, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 54%|█████▍ | 269/500 [03:01<25:32, 6.64s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 54%|█████▍ | 271/500 [03:01<16:36, 4.35s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 56%|█████▌ | 280/500 [03:01<04:16, 1.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5625, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.11764705882352941, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 283/500 [03:02<03:03, 1.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 
'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 286/500 [03:03<02:32, 1.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 58%|█████▊ | 290/500 [03:03<01:37, 2.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 59%|█████▉ | 295/500 [03:03<00:51, 3.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2222222222222222, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 59%|█████▉ | 297/500 [03:04<00:46, 4.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 60%|██████ | 301/500 [03:04<00:35, 5.65it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" 
] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 61%|██████ | 304/500 [03:05<00:31, 6.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.9090909090909091, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 61%|██████ | 306/500 [03:05<00:30, 6.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 308/500 [03:05<00:23, 8.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 310/500 [03:05<00:24, 7.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.3571428571428571, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 63%|██████▎ | 315/500 [03:06<00:18, 10.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 64%|██████▍ | 319/500 [03:06<00:23, 7.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 
0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 322/500 [03:07<00:17, 10.44it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 65%|██████▌ | 327/500 [03:07<00:12, 13.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 66%|██████▌ | 329/500 [03:07<00:11, 14.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 66%|██████▌ | 331/500 [03:07<00:16, 10.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.25, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 67%|██████▋ | 335/500 [03:08<00:20, 8.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 67%|██████▋ | 337/500 [03:08<00:18, 8.84it/s]" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 339/500 [03:08<00:18, 8.80it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.27118644067796605, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 69%|██████▊ | 343/500 [03:09<00:15, 9.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3157894736842105, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 69%|██████▉ | 347/500 [03:09<00:18, 8.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.14285714285714285, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 70%|███████ | 352/500 [03:10<00:14, 10.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.048780487804878044, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 71%|███████ | 354/500 [03:10<00:16, 8.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 
1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 358/500 [03:10<00:14, 10.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 72%|███████▏ | 360/500 [03:11<00:15, 9.09it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.21052631578947367, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 361/500 [04:01<19:51, 8.57s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 73%|███████▎ | 365/500 [04:01<08:58, 3.99s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 74%|███████▍ | 371/500 [04:02<03:12, 1.50s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.05263157894736842, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▍ | 373/500 
[04:02<02:28, 1.17s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▌ | 375/500 [04:02<01:55, 1.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6363636363636364, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 76%|███████▌ | 378/500 [04:03<01:14, 1.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 379/500 [04:03<01:03, 1.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 77%|███████▋ | 384/500 [04:03<00:26, 4.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 77%|███████▋ | 386/500 [04:03<00:20, 5.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 389/500 
[04:04<00:18, 6.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4347826086956522, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 78%|███████▊ | 391/500 [04:05<00:22, 4.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 79%|███████▉ | 394/500 [04:05<00:19, 5.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 80%|███████▉ | 398/500 [04:05<00:12, 8.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 80%|████████ | 400/500 [04:05<00:10, 9.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 80%|████████ | 402/500 [04:06<00:11, 8.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 404/500 [04:06<00:09, 9.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"metrics {'f1': 0.24000000000000002, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 406/500 [04:06<00:09, 9.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 82%|████████▏ | 409/500 [04:06<00:11, 7.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 83%|████████▎ | 413/500 [04:07<00:08, 9.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.09523809523809525, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 84%|████████▍ | 420/500 [04:07<00:05, 14.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 422/500 [04:08<00:07, 11.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 
'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▍ | 424/500 [04:08<00:08, 8.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▌ | 426/500 [04:08<00:09, 7.46it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 86%|████████▌ | 431/500 [04:09<00:07, 9.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2666666666666667, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 433/500 [04:09<00:07, 9.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 436/500 [04:09<00:05, 10.76it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4210526315789474, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 88%|████████▊ | 440/500 [04:10<00:05, 10.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 
0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 88%|████████▊ | 442/500 [04:10<00:05, 10.38it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 444/500 [04:10<00:06, 8.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 446/500 [04:10<00:05, 9.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8235294117647058, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 90%|█████████ | 451/500 [04:11<00:04, 11.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 91%|█████████ | 455/500 [05:01<03:45, 5.02s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████▏| 457/500 [05:01<02:35, 3.61s/it]" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 92%|█████████▏| 462/500 [05:01<01:08, 1.80s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5263157894736842, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 93%|█████████▎| 466/500 [05:02<00:38, 1.13s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 94%|█████████▍| 471/500 [05:03<00:16, 1.71it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▍| 473/500 [05:03<00:12, 2.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.47619047619047616, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▌| 475/500 [05:03<00:10, 2.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 95%|█████████▌| 477/500 [05:03<00:07, 3.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 96%|█████████▌| 479/500 [05:04<00:05, 3.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.06060606060606061, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 96%|█████████▋| 482/500 [05:04<00:03, 4.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 97%|█████████▋| 483/500 [05:05<00:04, 3.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 97%|█████████▋| 486/500 [05:05<00:02, 5.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 488/500 [05:05<00:01, 6.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.375, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"\r", "Evaluating workflow: 98%|█████████▊| 490/500 [05:05<00:01, 8.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 494/500 [05:06<00:00, 10.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 496/500 [05:06<00:00, 10.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 498/500 [05:07<00:00, 5.14it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 499/500 [05:07<00:00, 3.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 100%|██████████| 500/500 [05:08<00:00, 1.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.1, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-07 19:35:11.871\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m4\u001b[0m - \u001b[1mEvaluation metrics (before optimization): {'f1': 0.6032480280405156, 'em': 0.428, 'acc': 0.632}\u001b[0m\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "\n" ] } ],
"source": [
 "logger.info(\"Evaluating workflow on test set...\")\n",
 "with suppress_logger_info():\n",
 "    results = textgrad_optimizer.evaluate(dataset=benchmark, eval_mode=\"test\")\n",
 "logger.info(f\"Evaluation metrics (before optimization): {results}\")"
] },
{ "cell_type": "code", "execution_count": 14, "id": "364a5707", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'f1': 0.6032480280405156, 'em': 0.428, 'acc': 0.632}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results" ] },
{ "cell_type": "markdown", "id": "4ccff677", "metadata": {}, "source": [ "# sew" ] },
{ "cell_type": "code", "execution_count": 1, "id": "4c2a6fa7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. Please move to the pypdf library instead.\n", " warnings.warn(\n" ] } ],
"source": [
 "# Imports for the SEW section: one de-duplicated block (stdlib, third-party, evoagentx).\n",
 "import os\n",
 "from copy import deepcopy\n",
 "\n",
 "import nest_asyncio\n",
 "from dotenv import load_dotenv\n",
 "\n",
 "from evoagentx.agents import AgentManager\n",
 "from evoagentx.benchmark import (\n",
 "    AFlowMBPP,\n",
 "    AFlowMBPPPLUS,\n",
 "    HotPotQA,\n",
 "    HumanEval,\n",
 "    HumanEvalPLUS,\n",
 "    MBPP,\n",
 "    MBPPPLUS,\n",
 "    SciCode,\n",
 ")\n",
 "from evoagentx.core.callbacks import suppress_logger_info\n",
 "from evoagentx.core.logging import logger\n",
 "from evoagentx.evaluators import Evaluator\n",
 "from evoagentx.models import (\n",
 "    AzureOpenAIConfig,\n",
 "    LiteLLM,\n",
 "    LiteLLMConfig,\n",
 "    OpenAILLM,\n",
 "    OpenAILLMConfig,\n",
 ")\n",
 "from evoagentx.optimizers import SEWOptimizer, STRUCTUREOptimizer, TextGradOptimizer\n",
 "from evoagentx.optimizers.structure_optimizer import STRUCTUREWorkFlowScheme\n",
 "from evoagentx.prompts import StringTemplate\n",
 "from evoagentx.workflow import (\n",
 "    SEWWorkFlowGraph,\n",
 "    STRUCTUREWorkFlowGraph,\n",
 "    SequentialWorkFlowGraph,\n",
 ")\n",
 "\n",
 "# Allow nested event loops so async LLM calls can run inside the notebook kernel.\n",
 "nest_asyncio.apply()\n",
 "\n",
 "# Single-task workflow: one agent reads the problem and returns an XML-tagged answer.\n",
 "hotpotqa_graph_data = {\n",
 "    \"goal\": \"Answer the question based on the context. The answer should be a direct response to the question, without including explanations or reasoning.\",\n",
 "    \"tasks\": [\n",
 "        {\n",
 "            \"name\": \"answer_generate\",\n",
 "            \"description\": \"Answer the question based on the context.\",\n",
 "            \"inputs\": [\n",
 "                {\"name\": \"problem\", \"type\": \"str\", \"required\": True, \"description\": \"The problem to solve.\"}\n",
 "            ],\n",
 "            \"outputs\": [\n",
 "                {\"name\": \"answer\", \"type\": \"str\", \"required\": True, \"description\": \"The answer to the problem.\"}\n",
 "            ],\n",
 "            # The example must show the literal tags because parse_mode is \"xml\".\n",
 "            \"prompt_template\": StringTemplate(instruction=\"Think step by step to answer the question. You should explain your thinking process in the 'thought' field, and provide the final answer in the 'answer' field.\\nFormat your output in xml format, such as <thought>xxx</thought> and <answer>xxx</answer>.\"),\n",
 "            \"parse_mode\": \"xml\"\n",
 "        }\n",
 "    ]\n",
 "}\n",
 "\n",
 "# Pull credentials from a local .env file / the process environment instead of\n",
 "# hardcoding (or blanking) them in the notebook.\n",
 "load_dotenv()\n",
 "\n",
 "# Non-secret settings pinned for this experiment.\n",
 "os.environ[\"AZURE_OPENAI_DEPLOYMENT_NAME\"] = \"gpt-4o-mini\"\n",
 "os.environ[\"AZURE_OPENAI_API_VERSION\"] = \"2025-01-01-preview\"\n",
 "\n",
 "# Secrets must already be present; fail here rather than on the first LLM request.\n",
 "missing_env = [name for name in (\"AZURE_OPENAI_ENDPOINT\", \"AZURE_OPENAI_KEY\") if not os.getenv(name)]\n",
 "if missing_env:\n",
 "    raise RuntimeError(\"Missing required environment variable(s): \" + \", \".join(missing_env))\n",
 "\n",
 "llm_config = LiteLLMConfig(\n",
 "    model=\"azure/\" + os.environ[\"AZURE_OPENAI_DEPLOYMENT_NAME\"],  # Azure model format\n",
 "    azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n",
 "    azure_key=os.environ[\"AZURE_OPENAI_KEY\"],\n",
 "    api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n",
 "    top_p=0.85,\n",
 "    temperature=0.2,\n",
 "    frequency_penalty=0.0,\n",
 "    presence_penalty=0.0,\n",
 ")\n",
 "\n",
 "# The same configuration backs both the executor and the optimizer model.\n",
 "executor_llm = LiteLLM(config=llm_config)\n",
 "optimizer_llm = LiteLLM(config=llm_config)"
] },
{ "cell_type": "code", "execution_count": 2, "id": "ad0efa03", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "evoagentx.optimizers.sew_optimizer.SEWOptimizer" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "SEWOptimizer " ] },
{ "cell_type": "code", "execution_count": 3, "id": "ad4b2024", "metadata": {}, "outputs": [], "source": [ "# difficult easy " ] },
{ "cell_type": "code", "execution_count": 4, "id": "c95059f0", "metadata": {}, "outputs": [], "source": [ "from evoagentx.benchmark import HotPotQA" ] },
{ "cell_type": "code", "execution_count": 6, "id": "84efabfa", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:48:36.501\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.hotpotqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1mloading HotPotQA data from /gpfs/radev/home/tl688/.evoagentx/data/hotpotqa/hotpot_train_v1.1.json ...\u001b[0m\n",
"\u001b[32m2025-12-09 17:48:40.023\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.benchmark.hotpotqa\u001b[0m:\u001b[36m_load_data_from_file\u001b[0m:\u001b[36m51\u001b[0m - \u001b[1mloading HotPotQA data from /gpfs/radev/home/tl688/.evoagentx/data/hotpotqa/hotpot_dev_distractor_v1.json ...\u001b[0m\n" ] } ],
"source": [
 "# Evaluate and optimize with the Azure-backed LiteLLM executor defined in the setup cell.\n",
 "# (The former OpenAILLMConfig/OpenAILLM construction was overwritten on the next line and\n",
 "# depended on an OPENAI_API_KEY name that this section never defines.)\n",
 "llm = executor_llm\n",
 "\n",
 "# obtain SEW workflow\n",
 "sew_graph = SEWWorkFlowGraph.from_dict(hotpotqa_graph_data)\n",
 "agent_manager = AgentManager()\n",
 "agent_manager.add_agents_from_workflow(sew_graph, executor_llm.config)\n",
 "\n",
 "# PubMedQASplits and collate_func are defined in the first section of this notebook;\n",
 "# run that cell before this one on a fresh kernel.\n",
 "# NOTE(review): the stored output of this cell shows HotPotQA files being loaded, so it\n",
 "# appears to predate the switch to PubMedQASplits - re-run to refresh it.\n",
 "benchmark = PubMedQASplits()\n",
 "\n",
 "# obtain Evaluator\n",
 "evaluator = Evaluator(llm=llm, agent_manager=agent_manager, collate_func=collate_func, num_workers=20, verbose=True)"
] },
{ "cell_type": "code", "execution_count": 8, "id": "8598151b", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(sew_graph.to_dict()['nodes'])" ] },
{ "cell_type": "code", "execution_count": 9, "id": "b1f7fc18", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(sew_graph.edges)" ] },
{ "cell_type": "code", "execution_count": 10, "id": "33859fa8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sew_graph.edges" ] },
{ "cell_type": "code", "execution_count": 11, "id": "3c048529", "metadata": { "scrolled": true }, "outputs": [],
"source": [
 "# obtain SEWOptimizer after having more roles, default\n",
 "# NOTE(review): in the recorded optimize() run, steps 0-9 all log 'Failed to parse workflow\n",
 "# string' and/or 'Error in step N ... Skip this step', so those steps produced no candidate\n",
 "# workflow - confirm the graph/agent llm_config wiring before trusting the optimized result.\n",
 "optimizer = SEWOptimizer(\n",
 "    graph=sew_graph,\n",
 "    evaluator=evaluator,\n",
 "    llm=llm,\n",
 "    max_steps=20,\n",
 "    eval_rounds=3,\n",
 "    repr_scheme=\"python\",\n",
 "    optimize_mode=\"all\",\n",
 "    order=\"zero-order\",\n",
 "    max_rounds=20,\n",
 ")\n",
 "\n",
 "# Optional baseline on the test split (left disabled; uncomment to run):\n",
 "# with suppress_logger_info():\n",
 "#     metrics = optimizer.evaluate(dataset=benchmark, eval_mode=\"test\")\n",
 "# print(\"Evaluation metrics: \", metrics)\n"
] },
{ "cell_type": "code", "execution_count": 13, "id": "8b05058e", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:49:17.743\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m678\u001b[0m - \u001b[1mOptimizing the SEWWorkFlowGraph workflow with python representation.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:17.744\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m682\u001b[0m - \u001b[1mRun initial evaluation on the original workflow ...\u001b[0m\n", "Evaluating workflow: 2%|▏ | 1/50 [00:02<01:39, 2.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Task exception was never retrieved\n", "future: exception=RuntimeError('Event loop is closed')>\n", "Traceback (most recent call last):\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/tasks.py\", line 277, in __step\n", " result = coro.send(None)\n", " ^^^^^^^^^^^^^^^\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/utils.py\", line 873, in _client_async_logging_helper\n", " GLOBAL_LOGGING_WORKER.ensure_initialized_and_enqueue(\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/litellm_core_utils/logging_worker.py\", line 322, in ensure_initialized_and_enqueue\n", "
self.enqueue(async_coroutine)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/site-packages/litellm/litellm_core_utils/logging_worker.py\", line 131, in enqueue\n", " self._queue.put_nowait(task)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/queues.py\", line 147, in put_nowait\n", " self._wakeup_next(self._getters)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/queues.py\", line 63, in _wakeup_next\n", " waiter.set_result(None)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/futures.py\", line 263, in set_result\n", " self.__schedule_callbacks()\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/futures.py\", line 173, in __schedule_callbacks\n", " self._loop.call_soon(callback, self, context=ctx)\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/base_events.py\", line 762, in call_soon\n", " self._check_closed()\n", " File \"/gpfs/radev/home/tl688/.conda/envs/evoagentx/lib/python3.11/asyncio/base_events.py\", line 520, in _check_closed\n", " raise RuntimeError('Event loop is closed')\n", "RuntimeError: Event loop is closed\n", "Evaluating workflow: 14%|█▍ | 7/50 [00:02<00:09, 4.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 28%|██▊ | 14/50 [00:02<00:03, 9.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.35294117647058826, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 
1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.375, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▍ | 17/50 [00:03<00:04, 8.10it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 38%|███▊ | 19/50 [00:03<00:05, 5.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 42%|████▏ | 21/50 [00:04<00:05, 5.71it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 48%|████▊ | 24/50 [00:04<00:03, 6.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 28/50 [00:04<00:02, 10.25it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.888888888888889, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 60%|██████ | 30/50 [00:05<00:02, 
8.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.19354838709677416, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 32/50 [00:05<00:02, 8.67it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.06896551724137931, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:49:23.697\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72a2935542991f9a20c546', 'answer': 'Velvetpark', 'question': 'Is Velvetpark or Shape magazine written more for a lesbian and queer-identified female readership?', 'supporting_facts': [['Velvetpark', 1], ['Shape (magazine)', 1], ['Shape (magazine)', 7]], 'context': [['Jeguk Sinmun', ['The Jeguk Sinmun (\"Imperial Post\"; 1898-1910) was a Seoul-based Korean language newspaper founded in 1898 by Yi Jong-myeon.', ' It was published using the purely vernacular Hangeul script and attracted a largely lower or middle class and female readership.', ' It was less political than the other papers of the period, concentrating instead on social issues.', ' One of its early reporters was the young Syngman Rhee.']], ['Velvetpark', ['Velvetpark: Dyke Culture in Bloom is a lesbian and feminist arts and culture website that regularly features music, literature, theater, fine arts, film, television, and social activism as it impacts queer culture.', ' \"Velvetpark\" also hosts a social network and dating community for lesbians and queer-identified women.']], ['Anna Kalata', ['Anna Kalata (born May 10, 1964, Milanówek, Poland) is a Polish politician, celebrity and occasional actress.', ' She was 
a member of the populist Samoobrona party.', \" In Jarosław Kaczyński's cabinet she was the minister of labour and social policy.\", ' She participated in the 12th season of Taniec z Gwiazdami (the Polish version of Dancing With The Stars).', ' After losing 38 kg she appeared on the cover of Shape magazine.']], ['Shape (magazine)', [\"Shape is a monthly English language fitness magazine started by Weider Publications in 1981, founded by Christine MacIntyre (a pioneer in women's free weight fitness) and became the number one women's fitness magazine.\", ' At that time, Weider Enterprises consisted primarily of the bodybuilding magazine \"Muscle & Fitness\".', ' Joe Weider and Christine MacIntyre had differing views of how to present \"Shape\", Weider endorsing a less journalistic and more commercial approach to articles, MacIntyre endorsing a more academic, doctor-based magazine.', ' Weider also endorsed a sexier approach to editorial while MacIntyre endorsed a healthier look for women, eschewing sexiness in the models and the copy.', ' MacIntyre largely won that battle, editing a magazine that required that every byline have an advanced medical degree, that cover models should look healthy rather than sexy, and that sexist language be avoided.', ' Christine MacIntyre was the editor-in-chief until her death in 1988.', ' Tara Kraft is the current editor-in-chief.', ' \"Shape\" found a readership based on that formula.']], ['Cynthia Heimel', ['Cynthia Heimel (née Glick) (born 1947 in Philadelphia) is a feminist humorist writer from Oakland, California.', ' She is a columnist and the author of satirical books primarily aimed at a female readership and known for their unusual titles, as well as a playwright and television writer.']], ['Femme', ['Femme is a lesbian sexual identity that was created in the working class lesbian bar culture of the 1950s.', ' It is a term used to distinguish feminine lesbian and bisexual women from their butch/stud lesbian counterparts and 
partners.', ' Today the term is still used in this way but in recent years - following the influence of Queer gender identity theories - its meaning has, sometimes contentiously, been expanded to describe a queer-identified person who is feminine in their presentation regardless of their gender or sexuality.']], ['Chapstick lesbian', ['A chapstick lesbian is a sub-group within lesbianism that Ellen DeGeneres popularised in 1997 in her show \"Ellen\".', ' It was originally constructed as response to the phrase \"lipstick lesbian\" that emerged in 1990, which refers to a femme lesbian who emphasises their female identity through their self-presentation.', ' The slang term \"chapstick lesbian\" identifies a category on the femme-butch lesbian continuum, where the female homosexual has a gender identity bias towards femme lesbianism, although does not identify or fit the criteria of being a lipstick lesbian.', ' The word is frequently used as an alternative to the term \"soft-butch\" lesbian or androgynous.', ' The key attributes recognisable of a chapstick lesbian is that they have a casual dress-code and lack of desire to wear make-up.', ' Next to this, they are also viewed as being athletic in nature and have a notable interest in sport.']], ['Elana Amsterdam', ['Elana Amsterdam is the New York Times Bestselling author of \"Paleo Cooking from Elana\\'s Pantry\".', ' She writes cookbooks for gluten-free cooking, using almond flour and coconut flour as a gluten-free alternative to wheat flour.', ' Her book, \"The Gluten-Free Almond Flour Cookbook\", was named one of the \"Best Cookbooks of 2009\" by The Denver Post.', ' Amsterdam has partnered with the California Almond Board in conjunction with her works.', ' Her blog, elanaspantry.com, was named one of the top 50 food blogs by Cision.', \" Amsterdam contributed an article to Shape Magazine and she was featured on Fox News's On the Hunt with Jonathan Hunt.\"]], ['Shōjo manga', ['Shōjo, shojo, or shoujo manga (少女漫画 , 
shōjo manga ) is manga aimed at a teenage female readership.', ' The name romanizes the Japanese 少女 (shōjo), literally \"young woman\".', ' Shōjo manga covers many subjects in a variety of narrative styles, from historical drama to science fiction, often with a focus on romantic relationships or emotions.', ' Strictly speaking, however, shōjo manga does not comprise a style or genre, but rather indicates a target demographic.']], ['Celesbian', ['The term celesbian (a portmanteau of \"celebrity\" and \"lesbian\") originally referred to a female celebrity known or reputed to be a lesbian and popular within the LGBT community.', ' Celesbianism as a Western media phenomenon came into vogue in 2008, when several female celebrities presented themselves as lesbians.', ' The term was first used by New Yorkers Pam Franco and Susan Levine, a disk jockey.', ' It was used in a full-page ad in a lesbian nightlife magazine, \"GO MAGAZINE\".', ' The ad was for the Mz Hip and Fit NY contest, the idea of Denise Cohen of Denco Designs & Events.', ' The contest was a search for the hottest lesbian in the United States.', ' The term \"celesbian\" was used for the celebrity lesbian judges.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: The input to LLMOutputParser.parse should be a str, but found .\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 34/50 [00:06<00:02, 5.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 72%|███████▏ | 36/50 [00:06<00:02, 6.52it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 38/50 [00:06<00:01, 7.68it/s]" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 80%|████████ | 40/50 [00:06<00:01, 8.96it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 42/50 [00:06<00:00, 9.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 88%|████████▊ | 44/50 [00:06<00:00, 8.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 94%|█████████▍| 47/50 [00:07<00:00, 6.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 98%|█████████▊| 49/50 [00:07<00:00, 7.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 100%|██████████| 50/50 [00:10<00:00, 4.97it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:49:27.850\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m685\u001b[0m - \u001b[1mInitial metrics: {'f1': 0.6862887507768912, 'em': 0.4897959183673469, 'acc': 0.7959183673469388}\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:49:28.398\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 104919 | Current cost: $0.000 | Current tokens: 76\u001b[0m\n", "\u001b[32m2025-12-09 17:49:29.101\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 105064 | Current cost: $0.000 | Current tokens: 145\u001b[0m\n", "\u001b[32m2025-12-09 17:49:29.102\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:29.102\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 0: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:29.480\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 105132 | Current cost: $0.000 | Current tokens: 68\u001b[0m\n", "\u001b[32m2025-12-09 17:49:29.986\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 105259 | Current cost: $0.000 | Current tokens: 127\u001b[0m\n", "\u001b[32m2025-12-09 17:49:29.987\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 1: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:32.694\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 105602 | Current cost: $0.000 | Current tokens: 343\u001b[0m\n", "\u001b[32m2025-12-09 17:49:35.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 106000 | Current cost: $0.000 | Current tokens: 398\u001b[0m\n", "\u001b[32m2025-12-09 17:49:35.314\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. 
Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:35.315\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 2: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:37.630\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 106265 | Current cost: $0.000 | Current tokens: 265\u001b[0m\n", "\u001b[32m2025-12-09 17:49:38.256\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 106597 | Current cost: $0.000 | Current tokens: 332\u001b[0m\n", "\u001b[32m2025-12-09 17:49:38.257\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:38.257\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 3: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:38.664\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 106710 | Current cost: $0.000 | Current tokens: 113\u001b[0m\n", "\u001b[32m2025-12-09 17:49:39.205\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 106855 | Current cost: $0.000 | Current tokens: 145\u001b[0m\n", "\u001b[32m2025-12-09 17:49:39.206\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:39.206\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 4: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:39.759\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 106942 | Current cost: $0.000 | Current tokens: 87\u001b[0m\n", "\u001b[32m2025-12-09 17:49:40.404\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 107078 | Current cost: $0.000 | Current tokens: 136\u001b[0m\n", "\u001b[32m2025-12-09 17:49:40.405\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 5: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:40.939\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 107194 | Current cost: $0.000 | Current tokens: 116\u001b[0m\n", "\u001b[32m2025-12-09 17:49:41.649\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 107348 | Current cost: $0.000 | Current tokens: 154\u001b[0m\n", "\u001b[32m2025-12-09 17:49:41.650\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. 
Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:41.650\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 6: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:42.504\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 107469 | Current cost: $0.000 | Current tokens: 121\u001b[0m\n", "\u001b[32m2025-12-09 17:49:43.315\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.021 | Total tokens: 107653 | Current cost: $0.000 | Current tokens: 184\u001b[0m\n", "\u001b[32m2025-12-09 17:49:43.317\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:43.317\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 7: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:45.235\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 107906 | Current cost: $0.000 | Current tokens: 253\u001b[0m\n", "\u001b[32m2025-12-09 17:49:45.925\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 108213 | Current cost: $0.000 | Current tokens: 307\u001b[0m\n", "\u001b[32m2025-12-09 17:49:45.927\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 8: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:48.388\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 108509 | Current cost: $0.000 | Current tokens: 296\u001b[0m\n", "\u001b[32m2025-12-09 17:49:49.103\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 108859 | Current cost: $0.000 | Current tokens: 350\u001b[0m\n", "\u001b[32m2025-12-09 17:49:49.105\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 9: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:49:50.621\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 109055 | Current cost: $0.000 | Current tokens: 196\u001b[0m\n", "\u001b[32m2025-12-09 17:49:51.217\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 109319 | Current cost: $0.000 | Current tokens: 264\u001b[0m\n", "\u001b[32m2025-12-09 17:49:51.219\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 10: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:52.726\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 109504 | Current cost: $0.000 | Current tokens: 185\u001b[0m\n", "\u001b[32m2025-12-09 17:49:53.325\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 109757 | Current cost: $0.000 | Current tokens: 253\u001b[0m\n", "\u001b[32m2025-12-09 17:49:53.326\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 11: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:54.041\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 109872 | Current cost: $0.000 | Current tokens: 115\u001b[0m\n", "\u001b[32m2025-12-09 17:49:55.248\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 110084 | Current cost: $0.000 | Current tokens: 212\u001b[0m\n", "\u001b[32m2025-12-09 17:49:55.249\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:55.249\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 12: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:55.700\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 110172 | Current cost: $0.000 | Current tokens: 88\u001b[0m\n", "\u001b[32m2025-12-09 17:49:56.267\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 110326 | Current cost: $0.000 | Current tokens: 154\u001b[0m\n", "\u001b[32m2025-12-09 17:49:56.268\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:56.268\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 13: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:49:58.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.022 | Total tokens: 110573 | Current cost: $0.000 | Current tokens: 247\u001b[0m\n", "\u001b[32m2025-12-09 17:50:00.238\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 110993 | Current cost: $0.000 | Current tokens: 420\u001b[0m\n", "\u001b[32m2025-12-09 17:50:00.239\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:00.240\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 14: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:01.141\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 111131 | Current cost: $0.000 | Current tokens: 138\u001b[0m\n", "\u001b[32m2025-12-09 17:50:01.941\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 111361 | Current cost: $0.000 | Current tokens: 230\u001b[0m\n", "\u001b[32m2025-12-09 17:50:01.942\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:01.942\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 15: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:02.262\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 111429 | Current cost: $0.000 | Current tokens: 68\u001b[0m\n", "\u001b[32m2025-12-09 17:50:02.765\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 111556 | Current cost: $0.000 | Current tokens: 127\u001b[0m\n", "\u001b[32m2025-12-09 17:50:02.766\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 16: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:03.209\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 111631 | Current cost: $0.000 | Current tokens: 75\u001b[0m\n", "\u001b[32m2025-12-09 17:50:03.873\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 111775 | Current cost: $0.000 | Current tokens: 144\u001b[0m\n", "\u001b[32m2025-12-09 17:50:03.874\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. 
Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:03.874\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 17: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:06.888\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 112183 | Current cost: $0.000 | Current tokens: 408\u001b[0m\n", "\u001b[32m2025-12-09 17:50:07.675\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 112658 | Current cost: $0.000 | Current tokens: 475\u001b[0m\n", "\u001b[32m2025-12-09 17:50:07.676\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:07.676\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 18: can only concatenate str (not \"NoneType\") to str. 
Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:09.142\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 112836 | Current cost: $0.000 | Current tokens: 178\u001b[0m\n", "\u001b[32m2025-12-09 17:50:10.278\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.models.model_utils\u001b[0m:\u001b[36mupdate_cost\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1mTotal cost: $0.023 | Total tokens: 113095 | Current cost: $0.000 | Current tokens: 259\u001b[0m\n", "\u001b[32m2025-12-09 17:50:10.279\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mparse_workflow_python_repr\u001b[0m:\u001b[36m403\u001b[0m - \u001b[33m\u001b[1mFailed to parse workflow string: 'llm_config'. Return the original workflow.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:50:10.279\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m700\u001b[0m - \u001b[33m\u001b[1mError in step 19: can only concatenate str (not \"NoneType\") to str. Skip this step.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:10.279\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m707\u001b[0m - \u001b[1mReach the maximum number of steps 20. 
Stop the optimization.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:10.280\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36moptimize\u001b[0m:\u001b[36m710\u001b[0m - \u001b[1mRestore the best graph from the snapshot ...\u001b[0m\n", "\u001b[32m2025-12-09 17:50:10.280\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.optimizers.sew_optimizer\u001b[0m:\u001b[36mrestore_best_graph\u001b[0m:\u001b[36m814\u001b[0m - \u001b[1mRestore the best graph from snapshot with metrics {'f1': 0.6862887507768912, 'em': 0.4897959183673469, 'acc': 0.7959183673469388} ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 0%| | 2/500 [00:01<05:31, 1.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 1%| | 5/500 [00:01<01:57, 4.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 2%|▏ | 9/500 [00:02<00:55, 8.81it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 2%|▏ | 11/500 [00:02<01:00, 8.04it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", 
"text": [ "\r", "Evaluating workflow: 3%|▎ | 13/500 [00:02<01:01, 7.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 3%|▎ | 15/500 [00:03<01:05, 7.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 3%|▎ | 16/500 [00:03<01:03, 7.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 4%|▍ | 21/500 [00:03<00:50, 9.51it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 5%|▍ | 23/500 [00:04<01:09, 6.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 5%|▍ | 24/500 [00:04<01:06, 7.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 5%|▌ | 25/500 [00:04<01:15, 6.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "Evaluating workflow: 6%|▌ | 28/500 [00:04<01:08, 6.88it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 6%|▌ | 31/500 [00:08<04:25, 1.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 6%|▋ | 32/500 [00:08<03:40, 2.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 7%|▋ | 37/500 [00:08<01:33, 4.94it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826086, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 39/500 [00:09<01:12, 6.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 8%|▊ | 41/500 [00:09<01:03, 7.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7272727272727273, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 9%|▊ | 43/500 [00:10<01:55, 3.96it/s]" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 9%|▉ | 46/500 [00:10<01:22, 5.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|▉ | 48/500 [00:10<01:12, 6.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|▉ | 49/500 [00:11<01:16, 5.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 10%|█ | 52/500 [00:11<01:10, 6.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 11%|█ | 53/500 [00:11<01:30, 4.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 12%|█▏ | 59/500 [00:12<01:03, 6.97it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 
1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 12%|█▏ | 61/500 [00:12<00:52, 8.42it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.125, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 13%|█▎ | 67/500 [00:13<00:45, 9.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 14%|█▍ | 70/500 [00:13<00:55, 7.74it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.30769230769230765, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 14%|█▍ | 72/500 [00:14<01:01, 6.99it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 15%|█▌ | 76/500 [00:14<00:52, 8.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 16%|█▌ | 79/500 [00:14<00:50, 8.35it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 
'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 16%|█▌ | 80/500 [00:14<00:50, 8.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 17%|█▋ | 84/500 [00:15<00:49, 8.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.16666666666666669, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:50:33.299\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abb66c05542992ccd8e7f3e', 'answer': 'spot-fixing', 'question': 'What offence were opening batsman Khalid Latif and 5 other cricketers suspended for, in February 2017?', 'supporting_facts': [['2017 Pakistan Super League spot-fixing scandal', 1], ['Khalid Latif (cricketer)', 1], ['Khalid Latif (cricketer)', 2]], 'context': [['Ray Gripper', ['Raymond Arthur Gripper (born 7 July 1938), in Salisbury, Southern Rhodesia, was a cricketer.', ' He was a right-handed opening batsman and became a regular member of the Rhodesian side for 15 years starting in 1957–58, at one stage captaining them.', ' His highest score was an innings of 279 not out made against Orange Free State in 1967–68.', ' This remained a Currie Cup record for some years.', ' His son Trevor played Test cricket for Zimbabwe, also as an opening batsman.']], ['Khalid Latif (cricketer)', ['Khalid Latif (Urdu: \\u200e ), (born 4 November 1985 in Karachi) is a Pakistani cricketer.', ' 
A right-handed opening batsman, Latif captained Pakistan in the 2004 U-19 Cricket World Cup win and the 2010 Asian Games bronze medal win.', ' In 2017, the Pakistan Cricket Board banned Latif from all forms of cricket for five years, for his involvement in spot-fixing.']], ['Len Hutton', ['Sir Leonard Hutton (23 June 1916\\xa0– 6 September 1990) was an English cricketer who played as an opening batsman for Yorkshire from 1934 to 1955 and for England in 79 Test matches between 1937 and 1955. \"', 'Wisden Cricketers\\' Almanack\" described him as one of the greatest batsmen in the history of cricket.', ' He set a record in 1938 for the highest individual innings in a Test match in only his sixth Test appearance, scoring 364 runs against Australia, a milestone that stood for nearly 20 years (and remains an England Test record).', ' In 1952, he became the first professional cricketer of the 20th Century to captain England in Tests; under his captaincy England won the Ashes the following year for the first time in 19 years.', \" Following the Second World War, he was the mainstay of England's batting, and the team depended greatly on his success.\"]], ['Bill Ponsford', ['William Harold \"Bill\" Ponsford (19 October 1900\\xa0– 6 April 1991) was an Australian cricketer.', ' Usually playing as an opening batsman, he formed a successful and long-lived partnership opening the batting for Victoria and Australia with Bill Woodfull, his friend and state and national captain.', ' Ponsford is the only player to twice break the world record for the highest individual score in first-class cricket; Ponsford and Brian Lara are the only cricketers to twice score 400\\xa0runs in an innings.', \" Ponsford holds the Australian record for a partnership in Test cricket, set in 1934 in combination with Donald Bradman(451 for 2nd wicket)—the man who broke many of Ponsford's other individual records.In fact,he along with Don Bradman set the record for the highest partnership ever for any 
wicket in Test cricket history when playing in away soil (451 runs for the second wicket)\"]], ['2017 Pakistan Super League spot-fixing scandal', [\"The 2017 Pakistan Super League spot-fixing scandal arose in February 2017 when the Pakistan Cricket Board (PCB) suspended cricketers under its anti-corruption code in an ongoing investigation backed by International Cricket Council (ICC)'s Anti-Corruption and Security Unit on spot-fixing during the 2017 Pakistan Super League.\", ' The six cricketers suspended by the PCB are: Sharjeel Khan (on 10 February), Khalid Latif (on 10 February), Nasir Jamshed (on 13 February), Mohammad Irfan (on 14 March), Shahzaib Hasan (on 17 March) and Mohammad Nawaz (16 May).']], ['Tamim Iqbal', ['Tamim Iqbal Khan (Bengali: তামিম ইকবাল খান ; born 20 March 1989) is an international Bangladeshi cricketer and former Test captain of the team.Tamim is arguably the best batsman in Bangladesh.', ' Tamim made his One Day International debut in 2007 and played his first Test the following year.', \" A left-handed opening batsman, he is the Bangladeshi's most successful runscorer to date.\", ' Between December 2010 and September 2011 he was vice-captain of the national side.', ' Considered as the best ever opening batsman for Bangladesh, Tamim has set up centuries in all three formats of the game and is also the first Bangladeshi to score 10,000 international runs.']], ['Sidath Wettimuny', ['Sidath Wettimuny is a former Sri Lankan cricketer, who played Test cricket and One Day Internationals as an opening batsman from 1982 to 1987.', ' Wettimuny was a typical opening batsman in that he often played very defensively, grafting for his runs, and his ODI strike rate of 48 shows this quite clearly.']], ['Khalid Latif (imam)', ['Khalid Latif is Executive Director and Chaplain (Imam) for the Islamic Center at New York University (NYU).']], ['Roy Virgin', ['A right-handed opening batsman, Virgin had a mostly solid but unspectacular career in first-class 
cricket, except for two individual seasons, one for each of his two counties, during which he looked as good as any opening batsman in county cricket and was mentioned as a possible Test player.']], ['Angus Robson', ['Angus James Robson (born 19 February 1992 in Sydney) is an Australian cricketer who played for Leicestershire.', ' He is the brother of England and Middlesex opening batsman, Sam.', ' He has appeared in 26 first-class matches as a right-handed batsman who bowls leg breaks.', ' He was part of the Leicestershire side that completed a famous first victory in 3 years against Essex on 3 June 2015, playing a big role in the side as an opening batsman, scoring 120 and 71 in the game.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 17%|█▋ | 85/500 [00:22<10:47, 1.56s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:50:33.396\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae6479a55429929b0807b1b', 'answer': '\"That Bizarre Girl\"', 'question': \"Jun Ji-hyun rose to fame after her as a girl in a film that's title means what?\", 'supporting_facts': [['Jun Ji-hyun', 0], ['Jun Ji-hyun', 1], ['My Sassy Girl', 0]], 'context': [['My Sassy Girl', ['My Sassy Girl (Korean: 엽기적인 그녀 ; literally, \"That Bizarre Girl\") is a 2001 South Korean romantic comedy film directed by Kwak Jae-yong, starring Jun Ji-hyun and Cha Tae-hyun.']], ['Il Mare', ['Il Mare (; lit.', ' \"time-transcending love\") is a 2000 South Korean film, starring Jun Ji-hyun and Lee Jung-jae, and directed by Lee Hyun-seung.', ' The title, \"Il Mare\", means \"The Sea\" in Italian, and is the name of the seaside house which is the setting of the story.', ' The two protagonists both live there two years apart in time, but are able to communicate through a mysterious mailbox.']], ['Happy Together (1999 TV series)', ['Happy Together () is a 1999 South Korean television series starring Lee Byung-hun, Song Seung-heon, Kim Ha-neul, Jo Min-su, and Jun Ji-hyun It aired on SBS from June 16 to August 5, 1999 on Wednesdays and Thursdays at 21:55 for 16 episodes.', ' Starring young actors who would go on to become Korean TV and film stars, the hit drama revolves around five children who were separated at the death of their parents, and the love, conflicts, and reconciliation that these siblings go through when they meet 
again as adults.']], ['Windstruck', ['Windstruck (; lit.', ' \"Let me introduce (you to) my girlfriend\") is a 2004 South Korean romantic comedy.', ' It stars Jun Ji-hyun, Jang Hyuk, and was directed by Kwak Jae-yong.', ' The film held its premiere in Hong Kong, attended by Jang and Jun, on 28 May 2004, being the first Korean film to do so.', ' It was released on June 3, 2004 by CJ Entertainment and ran at 123 minutes.']], ['Jun Ji-hyun', ['Jun Ji-hyun (born Wang Ji-hyun on 30 October 1981), also known as Gianna Jun, is a South Korean actress.', ' She rose to fame for her role as The Girl in the romantic comedy \"My Sassy Girl\" (2001), one of the highest-grossing Korean comedies of all time.', ' Other notable films include \"Il Mare\" (2000), \"Windstruck\" (2004), \"The Thieves\" (2012), \"The Berlin File\" (2013) and \"Assassination\" (2015).']], ['My Love from the Star', ['My Love from the Star (; literally \"You Who Came from the Stars\") is a South Korean television series starring Jun Ji-hyun, Kim Soo-hyun, Park Hae-jin and Yoo In-na in lead.', ' Written by Park Ji-eun, it is a romantic fantasy story about an alien who landed on Earth in the Joseon Dynasty and, 400 years later, falls in love with a top actress in the modern era.', \" It aired on SBS from December 18, 2013 to February 27, 2014 on Wednesdays and Thursdays at 22:00 for 21 episodes; the production company extended the original 20-episode run with one episode, due to high viewers' demand.\"]], ['The Berlin File', ['The Berlin File (; lit.', ' \"Berlin\") is a 2013 South Korean spy action thriller film written and directed by Ryoo Seung-wan.', ' Ha Jung-woo stars as a North Korean agent in Berlin who is betrayed and cut loose when a weapons deal is exposed.', ' Together with his wife, a translator at the North Korean embassy in Berlin played by Jun Ji-hyun, they try to escape being purged, with Ryoo Seung-bum and Han Suk-kyu playing North and South Korean operatives on their trail.']], ['White 
Valentine', ['White Valentine () is a 1999 Korean romantic film directed by Yang Yun-ho.', ' It stars Park Shin-yang with Jun Ji-hyun in her movie debut.']], ['Daisy (2006 film)', ['Daisy () is a 2006 film directed by Hong Kong filmmaker Andrew Lau of the \"Infernal Affairs\" trilogy.', ' \"Daisy\" is an urban romantic melodrama involving young painter Hye-young (Jun Ji-hyun), Interpol detective Jeong Woo (Lee Sung-jae), and professional hitman Park Yi (Jung Woo-sung).']], ['The Legend of the Blue Sea', ['The Legend of the Blue Sea () is a 2016-2017 South Korean television series starring Jun Ji-hyun and Lee Min-ho.', \" Inspired by a classic Joseon legend from Korea's first collection of unofficial historical tales about a fisherman who captures and releases a mermaid, this drama tells the love story of a con-artist and a mermaid who travels across the ocean to find him.\", ' It aired on SBS every Wednesday and Thursday at 22:00 (KST) started from 16 November 2016 until 25 January 2017.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 17%|█▋ | 87/500 [00:23<07:05, 1.03s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:50:38.287\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ac29aa655429967731025b2', 'answer': '26,000', 'question': 'Eduard Schweizer teaches at a German university with over how many students? ', 'supporting_facts': [['Eduard Schweizer', 0], ['University of Zurich', 0]], 'context': [['University of Zurich', ['The University of Zurich (UZH, German: \"Universität Zürich\" ), located in the city of Zürich, is the largest university in Switzerland, with over 26,000 students.', ' It was founded in 1833 from the existing colleges of theology, law, medicine and a new faculty of philosophy.']], ['BMVA Summer School', ['BMVA Summer School is an annual summer school on computer vision, organised by the British Machine Vision Association and Society for Pattern Recognition (BMVA).', ' The course is residential, usually held over five days, and consists of lectures and practicals in topics in image processing, computer vision, pattern recognition.', ' It is intended that the course will complement and extend the material in existing technical courses that many students/researchers will encounter in their early stage of postgraduate training or caeeers.', ' It aims to broaden awareness of knowledge and techniques in Vision, Image Computing and Pattern Recognition, and to develop appropriate research skills, and for students to interact with their peers, and to make contacts among those who will be the active 
researchers of their own generation.', ' It is open to students from both UK and non-UK universities.', ' The registration fees vary based on time of registration and are in general slightly higher for non-UK students.', ' The summer school has been hosted locally by various universities in UK that carry out Computer Vision research, e.g., Kingston University, the University of Manchester and Swansea University.']], ['University Cooperative Housing Association', ['University Cooperative Housing Association (UCHA) is a student housing cooperative in Westwood, Los Angeles serving the University of California, Los Angeles (UCLA) campus.', ' Approximately 400 students live there and in addition to housing UCLA students, UCHA offers housing to students of any college, including the UCLA Extension and Santa Monica College.', ' UCHA operates three buildings, Hardman-Hansen Hall, Essene Hall, and Robison Hall, the latter being a renovated version of the Landfair Apartments and cultural landmark designed by Richard Neutra.', ' Jim Morrison, of The Doors, purportedly lived at UCHA during his time at UCLA.', ' Green Day and Margaret Cho performed at UCHA in the early 1990s.', ' In addition to the UCLA campus, Hardman-Hansen and Robison Halls were used as filming locations for the 1982 horror film, The Dorm That Dripped Blood.', \" Many students of China's Lost Generation studying at UCLA reside at UCHA.\"]], ['Eduard Schweizer', ['Eduard Schweizer (1913-2006) was a Swiss New Testament scholar who taught at the University of Zurich for an extended period.', ' He won the Burkitt Medal for Biblical Studies in 1996.']], ['National High School Debate League of China', ['The National High School Debate League of China, or simply NHSDLC, is an English-language high school debate league serving Mainland China.', ' It uses the Public Forum debate format.', ' Each year, the NHSDLC sees around 50,000 students participate in its debate workshops and around 12,000 students participate in 
its regional or national tournaments that it hosts in more than 33 cities in China.', ' According to The Economist, many students believe participating will help their application to a Western university.', \" It was founded in 2012, and it hosted one of China's first ever English-language high school national debate tournaments for local students at Peking University in May 2013.\", ' Each year, its national debate championship hosted in Beijing attracts 450 students from around China.', ' NHSDLC is partnered with Harvard College Mentors for Urban Debate, Penn for Youth Debate, the Chicago Debate Society, the Yale Debate Association, Sunrise International Education, and the Stanford Youth Debate Initiative.']], ['Donald B. Fullerton', ['Donald B. Fullerton (July 6, 1892\\xa0– April 9, 1985) was a Christian missionary and teacher who founded the Princeton Evangelical Fellowship and served with it from 1931 until 1980.', ' He was noted for convincing many students at Princeton University of the truth of the Christian faith.', ' Arthur Glasser also credited his conversion to Dr. Fullerton, through hearing him speak at the Keswick Bible Conference.', ' In addition to his evangelistic efforts, Dr. 
Fullerton was a major spiritual influence on many students including Paul Pressler, a major figure in the Conservative resurgence of the Southern Baptist Convention, and the noted Reformed theologian John Frame.', ' He was a member of the Princeton University Class of 1913 and received an honorary Doctorate of Ministry from Grace Theological Seminary.']], ['Matthias Eduard Schweizer', ['Matthias Eduard Schweizer (8 August 1818 – 23 October 1860) was a Swiss chemist.']], ['Port Moody Secondary School', ['Port Moody Secondary School is a public coeducational high school located in Port Moody, British Columbia.', ' The school is notable for offering the International Baccalaureate Program and the Career Preparation Program to its students, which many students travel from other districts to participate in.', ' There are approximately 400 students in the pre-International Baccalaureate Diploma programme and the International Baccalaureate diploma programme tracks.', \" Port Moody Secondary is widely known in the area for sending an impressive number of students to the world's most selectivities universities.\", ' In the past three years, students have matriculated to schools such as: Harvard University, Princeton University, University of Chicago, University of Pennsylvania, Cornell University, UC Berkeley and Dartmouth College.', ' Port Moody serves grades nine through twelve and currently has an enrollment of 1,312 students.', ' The school is respected for its academics, visual arts, musical arts and athletic programs.']], [\"Pennsylvania Governor's School for the Sciences\", [\"The Pennsylvania Governor's School for the Sciences (PGSS) is one of the Pennsylvania Governor's Schools of Excellence, a group of five-week summer programs for gifted high school students in the state of Pennsylvania.\", ' Carnegie Mellon University in Pittsburgh has hosted the program since its inception in 1982.', ' Most recently, it has been directed by Physics Professor Dr. 
Barry Luokkala.', ' Participants are required to be Pennsylvania high school students between their junior and senior years and are required to live in the dormitories for the full five weeks of the program.', ' Admission is very competitive - approximately 500 of the most scientifically gifted students in the state compete for 56 to 60 slots in the program.', \" The aim of PGSS is to promote interest in science rather than to advance students' knowledge in a specific area.\", ' The curriculum includes five \"core\" courses in Biology, Chemistry, Computer Science, Mathematics and Physics, and numerous electives.', ' In addition to taking classes, students are required to participate in a lab course and a research-style team project.', ' The emphasis is on cooperation, rather than competition - students are encouraged to both collaborate with other students on academic work and to interact socially.', ' The Residence Life staff provides a number of structured social events to foster friendship and teamwork.', ' There is at least one event per day and is advertised on the social calendar in the dorm lobby.', ' For many students, the social development gained from the program rivals the scientific knowledge they acquire.', ' The students leave the program with a strong bond; most attend an organized reunion the following year after the 4th week of the program.']], ['KJSCE Symphony', ['Symphony, the annual cultural festival of K. J. 
Somaiya College of Engineering, has created its name and popularity among Engineering and Management institutes far and wide for the last decade.', ' Every year many students from various institutes be a part of this festival.', ' The main aim is to promote, encourage and exhibit the talents of the students on a common platform and create interest in the classical, vocal and instrumental music.', ' Symphony hosts more than 9000 students every year.', ' Symphony has been graced by artists of the magnitude of Pt.', ' Hariprasad Chaurasia, Pt ShivKumar Sharma, Louis Banks, Hariharan, Indus Creed, Parikrama, KK, Bombay Vikings, Taufiq Qureshi, Dagar, Suraj Jagan, and Ustad Zakir Hussain.', ' The event also has a social touch to propagate a message relevant to the times like AIDS awareness, etc.', ' There have also been Auto Shows and an Army display at Symphony.', ' The organization is done by students which is also a time for building strong camaraderie and teamwork.', ' Many students look back fondly at the memories gathered during this phase of their lives.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 18%|█▊ | 88/500 [00:27<12:23, 1.80s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:50:38.590\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae4d2a55542990ba0bbb161', 'answer': 'major water deity', 'question': 'Giselle Cossard was known as Mother Giselle of what type of diety?', 'supporting_facts': [['Giselle Cossard', 0], ['Giselle Cossard', 1], ['Yemoja', 0]], 'context': [['Type A Kō-hyōteki-class submarine', ['The \"Type A Ko-hyoteki\" (甲標的甲型 , Kō-hyōteki kō-gata , Target \\'A\\', Type \\'A\\') class was a class of Japanese midget submarines (\"Ko-hyoteki\") used during World War II.', ' They had hull numbers but no names.', ' For simplicity, they are most often referred to by the hull number of the mother submarine.', ' Thus, the midget carried by \"I-16\"-class submarine was known as I-16\\'s boat, or \"I-16tou.\"']], ['Isabel Briggs Myers', ['Isabel Briggs Myers (October 18, 1897\\xa0– May 5, 1980) was an American author and co-creator of a personality inventory known as the Myers–Briggs Type Indicator (MBTI).', ' Briggs Myers created the MBTI with her mother, Katharine Cook Briggs.']], ['Tripura Sundari Temple', ['Tripura Sundari Temple is situated in the ancient Udaipur, about 55\\xa0km from Agartala, Tripura believed to be one of the holiest Hindu shrines in this part of the country.', ' Popularly known as Matabari, crowns in a small hillock and is served by the red-robed priests who traditionally, minister to the mother goddess Tripura Sundari.', ' Considered to be one of the 51 Shakti Peethas, consists of a square type 
sanctum of the typical Bengali hut.', \" It is believed that Sati's right foot fell here during Lord Shiva's Dance.\", ' The temple consist a square type sanctum with a conical dome.', ' It was constructed by Maharaja Dhanya Manikya in 1501A.', 'D, there are two identical images of the same deity inside the temple.', ' They are known as Tripura Sundari (5\\xa0feet high) and Chhotima (2\\xa0feet high) in Tripura.', \" The idol of Kali is worshiped at the temple of Tripura Sundari in the form of 'Soroshi'.\", ' One is made of kasti stone which is reddish black in colour.', ' It is believed that the idol was Chhotima was carried by king in battlefield.', ' This temple is also known as Kurma Pitha because it the temple premises resembles kurma i.e. tortoise.', ' Every year on Diwali, a famous Mela takes place near the temple which is visited by more than two lakhs pilgrims.']], ['Neonatal isoerythrolysis', ['Neonatal isoerythrolysis, also known as hemolytic icterus, is a disease most commonly seen in kittens and foals, but has also been reported in puppies.', ' In the kitten this is referred to as \"fading kitten syndrome.\"', ' It occurs when the mother has antibodies against the blood type of the newborn.']], ['Sweet Porridge', ['\"Sweet Porridge\", often known in English under the title of \"The Magic Porridge Pot\", is a folkloric German fairy tale recorded by the Brothers Grimm, as tale number 103 in \"Grimm\\'s Fairy Tales\", in the 19th century.', ' It is Aarne-Thompson type 565, the magic mill.', ' Other tales of this type include \"Why the Sea Is Salt\" and \"The Water Mother\".']], ['Giselle Cossard', ['Giselle Cossard Binon Omindarewa, (31 May 1923, Tangier - 21 January 2016, Duke of Caxias), Mãe-de-santo of Candomblé of Rio de Janeiro, was a French Brazilian anthropologist and writer.', ' She was also known as Mother Giselle of Yemoja, Daughter of Saint John of Goméia, Initiated for the Orisha Yemoja.']], ['Yemoja', ['Yemoja (Yoruba: \"Yemọja\" ) is a major 
water deity from the Yoruba religion.', ' She is an orisha and the mother of all orishas, having given birth to the 14 Yoruba gods and goddesses.', ' She is often syncretized with either Our Lady of Regla in the afrocuban diaspora or various other Virgin Mary figures of the Catholic Church, a practice that emerged during the era of the Trans-Atlantic slave trade.', ' Yemoja is motherly and strongly protective, and cares deeply for all her children, comforting them and cleansing them of sorrow.', ' She is said to be able to cure infertility in women, and cowrie shells represent her wealth.', ' She does not easily lose her temper, but when angered she can be quite destructive and violent, as the flood waters of turbulent rivers.']], ['Theotokos of Vladimir', ['The Theotokos of Vladimir (Greek: Θεοτόκος του Βλαντίμιρ ), also known as Our Lady of Vladimir, Vladimir Mother of God, or Virgin of Vladimir (Russian: Владимирская Икона Божией Матери ) is a medieval Byzantine icon of the Virgin and Child.', ' In 1169 Andrei Bogolyubsky sacked Kiev, and, after plundering the city, stole much religious artwork, including a Byzantine \"Mother of God\" icon which was transferred to Vladimir (for references see Yury Dolgorukiy and Andrey Bogolyubskiy).', ' It is one of the most venerated Orthodox icons and a fine and early example of the iconography of the \"Eleusa\" (tenderness) type, with the Christ child snuggling up to his mother\\'s cheek.', ' The \"Theotokos\" (Greek for Virgin Mary, literally meaning \"Birth-Giver of God\") is regarded as the holy protectress of Russia.', ' The icon is displayed in the Tretyakov Gallery, Moscow in a functioning church in the grounds of the museum.', ' Her feast day is June 23rd o.s. / July 6th n.s. 
Even more than most famous icons, the original has been copied repeatedly for centuries, and many copies have considerable artistic and religious significance of their own.']], ['Portuguese poetry', ['The beginnings of Portuguese poetry go back to the early 12th century, around the time when the County of Portugal separated from the medieval Kingdom of Galicia in the northwest of the Iberian Peninsula.', ' It was in this region that the ancestral language of both modern Portuguese and modern Galician, known today as Galician-Portuguese, was the common language of the people.', \" Like the troubadour culture in the Iberian Peninsula and the rest of Europe, Galician-Portuguese poets sang the love for a woman, which often turned into personal insults, as she had hurt her lover's pride.\", ' However, this region produced a specific type of song, known as \"cantigas de amigo\" (songs of a friend).', ' In these, the lyrical subject is always a woman (though the singer was male) talking about her friend (lover) from whom she has been separated - by war or other activities - as shown in the Reconquista.', ' They discuss the loneliness that the woman feels.', \" But some poems also project eroticism, or confess the lover's meeting in a secret place, often through a dialogue she has with her mother or with natural elements (such could be considered a custom adapted from the pagan peoples in the region).\", ' Epic poetry was also produced, as was common in Romantic medieval regions (\"Gesta de D. 
Afonso Henriques\", of unknown authorship).']], [\"Eve's pudding\", [\"Eve's pudding, also known as Mother Eve's pudding, is a type of traditional British pudding now made from apples and Victoria sponge cake mixture.\", ' The apples are allowed to stew at the bottom of the baking dish while the cake mixture cooks on top.', ' The name is a reference to the biblical Eve.', \" It is a simplified version of Duke of Cumberland's pudding.\", ' The earliest known version dates from 1824, predating baking powder, and therefore uses grated bread and shredded suet.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 18%|█▊ | 89/500 [00:28<10:02, 1.47s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:50:38.644\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae7ac495542993210983eee', 'answer': 'What Ever Happened to Baby Jane?', 'question': 'What film came out first, All the Marbles or What Ever Happened to Baby Jane?', 'supporting_facts': [['...All the Marbles', 0], ['Robert Aldrich', 0], ['Robert Aldrich', 1]], 'context': [['David Cerda', ['David Cerda (born June 13, 1961, Hammond, Indiana) is an American performer and playwright based in Chicago, Illinois.', ' He is currently the artistic director 
for Hell In A Handbag Productions.', ' His campy, highly theatrical plays have made him an infamous icon within the Chicago theater scene.', ' He has written and appeared in a transgressive adaptation of \"Rudolph, the Red-Hosed Reindeer\", \"How ‘What Ever Happened to Baby Jane?’', ' Happened\" and POSEIDON!', ' An Upside-Down Musical which won the New York International Fringe Festival Best Ensemble Award.']], ['Dave Willock', ['Dave Willock (August 13, 1909 – November 12, 1990) was an American character actor.', ' Willock appeared in 181 films and television series from 1939 to 1989.', ' He is probably most familiar to modern audiences from his performance as Baby Jane Hudson\\'s father in the opening scenes of the cult classic \"What Ever Happened to Baby Jane?', '\" (1962).', ' He played seven different characters on CBS\\'s \"Green Acres\" with Eddie Albert and Eva Gabor, mostly portraying clerks or elevator operators.']], ['...All the Marbles', ['…All the Marbles (reissued as The California Dolls) is a 1981 comedy-drama film about the trials and travails of a female wrestling tag team and their manager.', ' It was directed by Robert Aldrich (his final film) and stars Peter Falk, Vicki Frederick and Laurene Landon.', ' The Pittsburgh Steeler hall of famer \"Mean\" Joe Greene plays himself.']], ['What Ever Happened to...', ['What Ever Happened to... 
is a 1991 American made-for-television thriller drama film directed by David Greene and adapted for the small screen by Brian Taggert, based on the novel \"What Ever Happened to Baby Jane?', '\" by Henry Farrell and the 1962 theatrical film of the same name.', ' It stars real-life sisters Lynn Redgrave as Baby Jane Hudson and Vanessa Redgrave as Blanche Hudson, in the roles previously played by Bette Davis and Joan Crawford in the 1962 adaptation.']], ['Robert Aldrich', ['Robert Burgess Aldrich (August 9, 1918 – December 5, 1983) was an American film director, writer and producer, notable for such films as \"Vera Cruz\" (1954), \"Kiss Me Deadly\" (1955), \"The Big Knife\" (1955), \"What Ever Happened to Baby Jane?', '\" (1962), \"Hush… Hush, Sweet Charlotte\" (1964), \"The Flight of the Phoenix\" (1965), \"The Dirty Dozen\" (1967) and \"The Longest Yard\" (1974).']], ['What Ever Happened to Baby Toto?', ['What Ever Happened to Baby Toto?', ' (Italian: \"Che fine ha fatto Totò Baby?\" )', ' is a 1964 Italian black comedy film written and directed by Ottavio Alessi.', ' It is a parody of Robert Aldrich\\'s \"What Ever Happened to Baby Jane?', '\".']], ['Psycho-biddy', ['Psycho-biddy is a colloquial term for a subgenre of the horror/thriller movie that features a formerly-glamorous older woman who has become mentally unbalanced and terrorizes those around her.', ' The genre officially began in 1962 with the film \"What Ever Happened to Baby Jane?', '\" (though it had some antecedents) and lasted through the mid-1970s.', ' It has also been referred to by the terms Grande Dame Guignol, hagsploitation and hag horror.', ' Renata Adler, in her \"The New York Times\" review for the 1968 film \"The Anniversary\", referred to the genre as \"the Terrifying Older Actress Filicidal Mummy genre.\"']], ['What Ever Happened to Baby Jane? 
(1962 film)', ['What Ever Happened to Baby Jane?', ' is a 1962 American psychological thriller–horror film produced and directed by Robert Aldrich, starring Bette Davis and Joan Crawford, about an aging former actress who holds her paraplegic sister captive in an old Hollywood mansion.', ' The screenplay by Lukas Heller is based on the 1960 novel of the same name by Henry Farrell.', \" Upon the film's release, it was met with widespread critical and box office acclaim and was later nominated for five Academy Awards, winning one for Best Costume Design, Black and White.\"]], ['Baby Jane Hudson', ['Baby Jane Hudson is a fictional character and the antagonist of Henry Farrell\\'s 1960 novel \"What Ever Happened to Baby Jane?', '\" She was portrayed by Bette Davis in the 1962 film adaptation and by Lynn Redgrave in the 1991 made-for-TV remake.', ' The 1962 production is the better-known, with Bette Davis earning an Academy Award nomination for her performance.', ' The character is portrayed by Susan Sarandon,who plays Bette Davis, in the TV anthology \"Feud: Bette and Joan\" aired in 2017.']], ['Debbie Burton', ['Debbie Burton was an American singer.', ' She is best known for dubbing the singing voice of the young Baby Jane Hudson (played by child actress Julie Allred) in the 1962 film \"What Ever Happened to Baby Jane?', '\", singing the song \"I\\'ve Written a Letter to Daddy\".', ' Burton also sang a duet with Bette Davis, the rock and roll song \"What Ever Happened to Baby Jane?\"', ', written by Frank DeVol and Lukas Heller.', ' It was released as a promotional single, with Burton\\'s rendition of \"I\\'ve Written a Letter to Daddy\" on the flipside.', ' An instrumental version of \"What Ever Happened to Baby Jane?\"', ' can be heard in the movie.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the 
token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 18%|█▊ | 91/500 [00:29<07:28, 1.10s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:50:39.951\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 18%|█▊ | 92/500 [00:29<06:17, 1.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:40.641\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▊ | 93/500 [00:30<05:53, 1.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:41.308\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 10 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 94/500 [00:30<05:31, 1.22it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:41.600\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 95/500 [00:31<04:34, 1.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:41.793\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 96/500 [00:31<03:39, 1.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:43.567\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 19%|█▉ | 97/500 [00:33<05:59, 1.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|█▉ | 98/500 [00:34<07:03, 1.05s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4615384615384615, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:50:45.533\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 16 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|█▉ | 99/500 [00:35<05:58, 1.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:45.800\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 100/500 [00:35<04:44, 1.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:48.813\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 20%|██ | 101/500 [00:38<09:15, 1.39s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:48.892\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:51.356\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██ | 103/500 [00:40<08:50, 1.34s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:51.478\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██ | 104/500 [00:41<06:50, 1.04s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:51.667\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██ | 105/500 [00:41<05:22, 1.23it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:52.965\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██ | 106/500 [00:42<06:13, 1.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:53.483\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:50:53.483\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 10 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 21%|██▏ | 107/500 [00:43<05:25, 1.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:53.505\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 10 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:54.239\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 13 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 110/500 [00:43<03:20, 1.95it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:50:54.817\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 13 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 22%|██▏ | 112/500 [00:44<02:49, 2.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:55.814\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 113/500 [00:45<03:31, 1.83it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:56.117\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 114/500 [00:45<03:10, 2.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:50:58.897\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 115/500 [00:48<06:39, 1.04s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:00.541\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 116/500 [00:50<07:37, 1.19s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:01.975\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 23%|██▎ | 117/500 [00:51<08:01, 1.26s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:02.741\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 24%|██▎ | 118/500 [00:52<07:08, 1.12s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:02.836\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:03.652\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 24%|██▍ | 120/500 [00:53<05:14, 1.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:03.679\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:03.983\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 24%|██▍ | 122/500 [00:53<03:36, 1.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▍ | 123/500 [00:54<03:50, 1.63it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7368421052631579, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:07.061\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▍ | 124/500 [00:56<06:21, 1.02s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:07.206\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▌ | 125/500 [00:56<04:58, 1.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:07.656\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 25%|██▌ | 126/500 [00:57<04:23, 1.42it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:07.684\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:07.979\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 26%|██▌ | 128/500 [00:57<02:55, 2.12it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:09.716\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 26%|██▌ | 129/500 [00:59<04:45, 1.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:11.458\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 26%|██▌ | 130/500 [01:01<06:14, 1.01s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 26%|██▌ | 131/500 [01:01<06:00, 1.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 27%|██▋ | 133/500 [01:02<03:36, 1.70it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 27%|██▋ | 134/500 [01:02<02:55, 2.08it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 28%|██▊ | 139/500 [01:02<01:00, 5.94it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 28%|██▊ | 141/500 [01:02<00:48, 7.42it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 29%|██▉ | 145/500 [01:03<00:43, 8.16it/s]" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 29%|██▉ | 147/500 [01:03<00:37, 9.37it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|██▉ | 149/500 [01:04<00:51, 6.79it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 30%|███ | 151/500 [01:04<01:07, 5.20it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 31%|███ | 154/500 [01:04<00:49, 6.97it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 31%|███ | 156/500 [01:05<00:39, 8.64it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 158/500 [01:07<02:15, 2.53it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 159/500 [01:08<03:21, 1.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 160/500 [01:08<02:56, 1.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 32%|███▏ | 161/500 [01:09<02:42, 2.09it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.17391304347826084, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 33%|███▎ | 165/500 [01:09<01:19, 4.21it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 34%|███▍ | 170/500 [01:09<00:44, 7.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.7692307692307693, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 34%|███▍ | 172/500 [01:10<01:11, 4.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.1111111111111111, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 35%|███▌ | 
176/500 [01:11<00:52, 6.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.625, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 35%|███▌ | 177/500 [01:11<00:55, 5.78it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 36%|███▌ | 179/500 [01:11<00:53, 5.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.08333333333333334, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 36%|███▌ | 181/500 [01:11<00:50, 6.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.24000000000000002, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 36%|███▋ | 182/500 [01:12<01:09, 4.59it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 37%|███▋ | 185/500 [01:12<00:58, 5.35it/s]Unclosed client session\n", "client_session: \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Evaluating workflow: 37%|███▋ | 186/500 [01:13<01:12, 4.31it/s]Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.447861675)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.424225066)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.867846096)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.464736516)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.741797127)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.66892916)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.87465422)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900992.646339724)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900988.90205124)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.398130026)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900990.117923113)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9900987.820199456)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.40997424)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900988.09916453)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.507396225)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.3982037)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900988.372753982)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.694346609)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.77917563)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.101632733)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900987.729158033)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.015242323)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900992.473841628)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.650610672)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.440078005)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.24249936)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900990.35859264)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.765040504)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.250709284)])']\n", "connector: \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", 
"connections: ['deque([(, 9900989.682905024)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900990.989716977)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900990.143409532)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900990.316368464)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.51176343)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.611756269)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.093860747)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.724075453)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.51944911)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.73087138)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.668732414)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900989.456839943)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.078686344)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", 
"client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.32544321)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900992.205809632)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900992.874902256)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.884841861)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900991.389818488)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900995.138487663)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900992.010101842)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9900992.757979088)])']\n", "connector: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed client session\n", "client_session: \n", "Unclosed connector\n", "connections: ['deque([(, 9901040.062205683)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9901039.45589229)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9901040.603285013)])']\n", "connector: \n", "Unclosed connector\n", "connections: ['deque([(, 9901039.137437675)])']\n", "connector: \n", "Evaluating workflow: 39%|███▊ | 193/500 [01:13<00:24, 12.71it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metricsmetrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", " {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.38095238095238093, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.14285714285714288, 'em': 0.0, 
'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 39%|███▉ | 196/500 [01:14<00:40, 7.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|███▉ | 198/500 [01:14<00:48, 6.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|████ | 200/500 [01:15<00:50, 5.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 40%|████ | 202/500 [01:15<00:48, 6.16it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 41%|████ | 205/500 [01:15<00:48, 6.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 42%|████▏ | 208/500 [01:15<00:34, 8.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "Evaluating workflow: 42%|████▏ | 212/500 [01:31<07:40, 1.60s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:51:41.813\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 13 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:45.170\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 43%|████▎ | 214/500 [01:34<07:50, 1.65s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:51:45.272\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a83b1e75542990548d0b220', 'answer': 'screenwriter', 'question': 'Worker: What professional title to both Christopher Nolan and Paul Schrader boast?', 'supporting_facts': [['Christopher Nolan', 0], ['Paul Schrader', 0]], 'context': [['Hardcore (1979 film)', ['Hardcore is a 1979 American crime drama film written and directed by Paul Schrader and starring George C. Scott, Peter Boyle and Season Hubley.', ' The story concerns a father searching for his daughter, who has vanished only to appear in a pornographic film.', ' Writer-director Schrader had previously written the screenplay for Martin Scorsese\\'s \"Taxi Driver\", and both films share a theme of exploring an unseen subculture.']], ['Paul Schrader', ['Paul Joseph Schrader (born July 22, 1946) is an American screenwriter, film director, and film critic.', ' Schrader wrote or co-wrote screenplays for four Martin Scorsese films: \"Taxi Driver\" (1976), \"Raging Bull\" (1980), \"The Last Temptation of Christ\" (1988), and \"Bringing Out the Dead\" (1999).', ' Schrader has also directed 18 feature films, including his directing debut crime drama, \"Blue Collar\" (co-written with his brother, Leonard Schrader), the crime drama \"Hardcore\" (a loosely autobiographical film also written by Schrader), his 1982 remake of the horror classic \"Cat People\", the crime drama \"American Gigolo\" (1980), the 
biographical drama \"\" (1985), the cult film \"Light Sleeper\" (1992), the drama \"Affliction\" (1997), the biographical film \"Auto Focus\" (2002), and the erotic dramatic thriller \"The Canyons\" (2013).']], ['Christopher Nolan', ['Christopher Edward Nolan ( ; born 30 July 1970) is an English-American film director, producer, and screenwriter.', ' He is one of the highest-grossing directors in history, and among the most successful and acclaimed filmmakers of the 21st century.']], ['The Yakuza', ['The Yakuza is a 1974 Japanese-American neo-noir gangster film directed by Sydney Pollack, written by Leonard Schrader, Paul Schrader, and Robert Towne.', \" The film is about a man (Robert Mitchum) who returns to Japan after several years away in order to rescue his friend's kidnapped daughter.\", ' Following a lackluster initial release, the film has since gained a cult following.']], ['Obsession (1976 film)', ['Obsession is a 1976 psychological thriller/mystery film directed by Brian De Palma, starring Cliff Robertson, Geneviève Bujold, John Lithgow, and Stocker Fontelieu.', ' The screenplay was by Paul Schrader, from a story by De Palma and Schrader.', \" Bernard Herrmann provided the film's soundtrack prior to his death in 1975.\", ' The story is about a New Orleans businessman who is haunted by guilt following the death of his wife and daughter during a kidnapping-rescue attempt.', ' Years after the tragedy, he meets and falls in love with a young woman who is the exact look-alike of his long dead wife.']], ['Old Boyfriends', ['Old Boyfriends is a 1979 American drama film directed by Joan Tewkesbury and written by Paul Schrader and Leonard Schrader.', ' The film stars Talia Shire, Richard Jordan, Keith Carradine, John Belushi, John Houseman and Buck Henry.', ' The film was released on April 13, 1979, by Embassy Pictures.']], ['The Walker', ['The Walker is a 2007 American-British drama film written and directed by Paul Schrader.', ' It is an independent production 
and is the latest installment in Schrader\\'s \"night workers\" series of films, starting with \"Taxi Driver\" in 1976, followed by \"American Gigolo\" in 1980 and \"Light Sleeper\" in 1992.']], ['Blue Collar (film)', ['Blue Collar is a 1978 American crime drama film directed by Paul Schrader, in his directorial debut.', ' It was written by Schrader and his brother Leonard, and stars Richard Pryor, Harvey Keitel and Yaphet Kotto.']], ['Dying of the Light (film)', ['Dying of the Light is a 2014 American psychological thriller film written and directed by Paul Schrader and starring Nicolas Cage, Anton Yelchin and Irène Jacob about a government agent who must track down and kill a terrorist before he loses his full memory from a disease.', ' It was released theatrically and through VOD formats by Lionsgate on December 5, 2014.', ' The film received extremely negative reviews, with controversy surrounding the heavy tampering and reediting of the footage by the studio, who denied Schrader final-cut privilege and led him and principal members of the cast to disown the released version and campaign against it.']], ['Leonard Schrader', ['Leonard Schrader (November 30, 1943 – November 2, 2006) was an American screenwriter and director, most notable for his ability to write Japanese language films and for his many collaborations with his brother, Paul Schrader.', ' He earned an Academy Award Nomination for the screenplay he wrote for the film \"Kiss of the Spider Woman\".']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 7 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 43%|████▎ | 215/500 [01:34<06:33, 1.38s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:51:51.514\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8986fd55429938390d4046', 'answer': 'animation', 'question': 'What technique does Cam Clarke and Akira have in common?', 'supporting_facts': [['Cam Clarke', 0], ['Akira (1988 film)', 0]], 'context': [['John Clarke (mountaineer)', ['John Clarke, CM (February 25, 1945 – January 23, 2003) was a Canadian explorer, mountaineer, conservationist, and wilderness educator.', ' He was born in Ireland to Brigit Ann Clarke (née Conway) and Thomas Kevin Clarke, and died in Vancouver, British Columbia of a brain tumor.', ' From 1964 until his death in 2003 Clarke spent at least six months of each year on extended backcountry trips, usually into the Coast Mountains of British Columbia using the technique of dropping food caches from small planes along an intended route, then traveling that route for weeks at a time.', ' His routes regularly led him along the high ridges and glaciated icefields of the west coast, and allowed him to make hundreds of first ascents of the many mountains along the way.', ' Many of these trips exceeded 30 days in length, and were often done solo, simply because nobody could afford the time to accompany him.']], ['Akira (given name)', ['Akira (あきら, アキラ ) is a common Japanese given name.', ' There are several kanji for Akira.', ' A popular kanji is 明 which means \"bright\", \"intelligent\", or \"clear\".', ' Though Akira is normally used to name males, it can be a female name as 
well.']], ['Cam Clarke', ['Cameron Arthur \"Cam\" Clarke (born November 6, 1957) is an American voice actor and singer, known for his voice-work in animation and video games.', ' He is best known for providing the voices of Leonardo and Rocksteady in the original \"Teenage Mutant Ninja Turtles\" animated series and Shotaro Kaneda in the 1989 original English dub of \"Akira\".', ' He often voices teenagers and other similarly young characters.', ' One of his prominent roles in video games was voicing Liquid Snake in the \"Metal Gear\" series.']], ['Common area maintenance charges', ['Common Area Maintenance charges, or CAM for short, are one of the net charges billed to tenants in a commercial triple net (NNN) lease, and are paid by tenants to the landlord of a commercial property.', ' A CAM charge is an additional rent, charged on top of base rent, and is mainly composed of maintenance fees for work performed on the common area of a property.']], ['Shaky camera', ['Handheld camera, shaky cam, queasy cam, queasicam, hand-held camera or free camera is a cinematographic technique where stable-image techniques are purposely dispensed with.', ' The camera is held in the hand, or given the appearance of being hand-held, and in many cases shots are limited to what one photographer could have accomplished with one camera.', ' Shaky cam is often employed to give a film sequence an ad hoc, electronic news-gathering, or documentary film feel.', ' It suggests unprepared, unrehearsed filming of reality, and can provide a sense of dynamics, immersion, instability or nervousness.', ' The technique can be used to give a pseudo-documentary or \"cinéma vérité\" appearance to a film.']], ['Transfer (propaganda)', ['Transfer is a technique used in propaganda and advertising.', ' Also known as association, this is a technique of projecting positive or negative qualities (praise or blame) of a person, entity, object, or value (an individual, group, organization, nation, patriotism, 
etc.) to another in order to make the second more acceptable or to discredit it.', ' It evokes an emotional response, which stimulates the target to identify with recognized authorities.', ' Often highly visual, this technique often utilizes symbols superimposed over other visual images.', \" An example of common use of this technique in the United States is for the President to be filmed or photographed in front of the country's flag.\", ' Another technique used is celebrity endorsement.']], ['Computer-aided manufacturing', ['Computer-aided manufacturing (CAM) is the use of software to control machine tools and related ones in the manufacturing of workpieces.', ' This is not the only definition for CAM, but it is the most common; CAM may also refer to the use of a computer to assist in all operations of a manufacturing plant, including planning, management, transportation and storage.', ' Its primary purpose is to create a faster production process and components and tooling with more precise dimensions and material consistency, which in some cases, uses only the required amount of raw material (thus minimizing waste), while simultaneously reducing energy consumption.']], ['Mosaic (film)', ['Mosaic is an animated superhero film about a new character created by Stan Lee.', ' It features the voice of Anna Paquin as Maggie Nelson with supporting roles done by Kirby Morrow, Cam Clarke, Garry Chalk, Ron Halder, and Nicole Oliver.', ' It was released under the \"Stan Lee Presents\" banner, which is a series of direct-to-DVD animated films distributed by POW Entertainment with Anchor Bay Entertainment.', ' The story was by Stan Lee, with the script by former X-Men writer Scott Lobdell.']], ['Akira (1988 film)', [\"Akira is a 1988 Japanese adult animated science fiction film directed by Katsuhiro Otomo, produced by Ryōhei Suzuki and Shunzō Katō, and written by Otomo and Izo Hashimoto, based on Otomo's manga of the same name.\"]], ['Jesus Green', ['Jesus Green is a park in 
the north of central Cambridge, England.', ' It is located north of Jesus College, hence the name.', ' Jesus Ditch runs along the southern edge Jesus Green.', ' On the northern edge of Jesus Green is the River Cam, with Chesterton Road (the A1303) on the opposite side.', ' To the east is Victoria Avenue and beyond that Midsummer Common, common land that is still used for grazing.', ' Victoria Avenue crosses the Cam at Victoria Bridge, connecting to Chesterton Road, at the northeastern corner of Jesus Green.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 43%|████▎ | 216/500 [01:41<11:03, 2.34s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:51:51.544\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abea05f5542991f661061b6', 'answer': 'Biscayne National Park to the east and Everglades National Park to the west', 'question': 'South Dade High School is located between what two national parks?', 'supporting_facts': [['South Dade High School', 0], ['Homestead, Florida', 0]], 'context': [['Miami Northwestern Senior High School', ['Miami Northwestern Senior High School is a public 4-year high school located in Miami, Florida, United States, serving students in grades 9-12 from the Liberty 
City neighborhood of Miami.', ' The school colors are old gold and royal blue.', ' The average annual enrollment is approximately 1,800 students.', ' Miami Northwestern was founded in 1955 to serve the increasing population of northern Miami.', \" Shortly after the school's inception, the Bull was chosen as the official school mascot from the former Dorsey High School.\", ' Miami Northwestern originally served as an all-black high school.', ' Beginning in 1966, Dade County high schools stopped being segregated, and most students from Booker T. Washington transferred to Northwestern (and Miami Jackson Senior High School) in 1967–1968.']], ['Yala National Park', ['Yala National Park is the most visited and second largest national park in Sri Lanka.', ' The park consists of five blocks, two of which are now open to the public, and also adjoining parks.', \" The blocks have individual names such as, Ruhuna National Park (block 1) and Kumana National Park or 'Yala East' for the adjoining area.\", ' It is situated in the southeast region of the country, and lies in Southern Province and Uva Province.', ' The park covers 979 km2 and is located about 300 km from Colombo.', ' Yala was designated as a wildlife sanctuary in 1900, and, along with Wilpattu was one of the first two national parks in Sri Lanka, having been designated in 1938.', ' The park is best known for its variety of wild animals.', ' It is important for the conservation of Sri Lankan elephants, Sri Lankan leopards and aquatic birds.']], ['Australian Alps National Parks and Reserves', ['The Australian Alps National Parks and Reserves is a group of eleven protected areas consisting of national parks, nature reserves and one wilderness park located in the Australian Capital Territory, New South Wales and Victoria and which was listed as a \"place\" on the Australian National Heritage List on 7 November 2008 under the \"Environment Protection and Biodiversity Conservation Act 1999\".', ' The listing which covers 
an area of 1653180 ha , contains the vast majority of alpine and sub-alpine environments in Australia.', ' The listing includes the following protected areas - Alpine, Baw Baw, Brindabella, Kosciuszko, Mount Buffalo, Namadgi and Snowy River national parks; the Avon Wilderness Park, and the Bimberi, Scabby Range and Tidbinbilla nature reserves.']], ['List of U.S. National Parks by elevation', ['This is a list of United States National Parks by elevation.', \" Most of America's national parks are located in mountainous areas.\", ' Even among those located close to the ocean, not all are flat.', ' Those few that are low-lying preserve important natural habitats that could never exist at high altitude.', ' Several national parks protect deep canyons with great vertical relief.', ' There are also three national parks whose primary features are caves, the depths of which are still being explored.']], ['High Sierra Camps', [\"The High Sierra Camps are nine rustic lodging facilities located in two national parks and a national monument in California's Sierra Nevada mountain range.\", ' Open most years from June or July to September, they are staffed camps with tent cabins and food service facilities.', ' The backcountry camps receive their supplies by pack mules.']], ['National parks of Scotland', ['National parks of Scotland are managed areas of outstanding landscape where habitation and commercial activities are restricted.', ' At present, Scotland has two national parks: Loch Lomond and The Trossachs National Park, created in 2002, and the Cairngorms National Park, created in 2003.', ' These were designated as such under the National Parks (Scotland) Act 2000 which was an early piece of legislation passed by the Scottish Parliament not long after its creation in 1999.', ' Scottish-born John Muir spearheaded the effort to create Yosemite National Park in the US, as well as the conservation movement at large.']], ['Homestead, Florida', ['Homestead is a city within 
Miami-Dade County in the U.S. state of Florida, between Biscayne National Park to the east and Everglades National Park to the west.', ' Homestead is primarily a Miami suburb and a major agricultural area.', ' It is a principal city of the Miami metropolitan area, which was home to an estimated 6,012,331 people at the 2015 census.']], ['Australian Alps Walking Track', ['The Australian Alps Walking Track is a long distance walking trail through the alpine areas of Victoria, New South Wales and ACT.', ' It is 655\\xa0km long, starting at Walhalla, Victoria and running through to Tharwa, ACT near Canberra.', ' The track weaves mainly through Australian national parks, such as Alpine National Park and Kosciuszko National Park, though it is not exclusively restricted to national parks.', ' It ascends many peaks including Mount Kosciuszko, Mount Bogong, and Bimberi Peak, the highest points in N.S.W., Victoria, and the A.C.T. respectively.', ' The AAWT crosses exposed high plains including the Victorian Bogong High Plains and the Main Range in NSW.', ' To walk the whole trail can take between 5 and 8 weeks.', ' Food drops or a support crew are necessary, as the trail passes through no towns, although it passes close to the ski resorts of Mt Hotham, Falls Creek, Mt Baw Baw, Thredbo, Charlotte Pass and Perisher.']], ['Yuraygir National Park', ['Yuraygir is a national park in New South Wales, Australia, located 482 km northeast of Sydney.', ' It was created in 1980, a result of the merger and enlargement of two national parks, Angourie and Red Rock National Parks, both of which had been established in 1975.', ' The name is a phonetic translation of the local indigenous tribe who had lived in the area, and had formerly been transcribed variously as Jeigir, Jiegera, Jungai, Yagir, Yegera, Yegir, Yiegera or Youngai.', ' At the time of its establishment in 1980, the park was fragmented, and parcels of land were bought over the following two decades to unite segments into a more 
contiguous protected area.', ' Sometimes these acquisitions required protracted negotiations (and legal disputes) with land owners.']], ['South Dade High School', ['South Dade Senior High School is a secondary school located in unincorporated Miami-Dade County, Florida, near Homestead.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:51:51.615\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5adca8215542994ed6169bbc', 'answer': 'John Mark Galecki', 'question': 'Which American actor tries to make his long distance relationship with Priya work in \"The Infestation Hypothesis\" ', 'supporting_facts': [['The Infestation Hypothesis', 2], ['Johnny Galecki', 0]], 'context': [['Rachel Specter', ['Rachel Sarah Specter (born April 9, 1980) is an American actress and writer, who is best known as the model for the RGX body spray commercials.', ' In addition to her work in commercials, Specter has guest-starred in episodes of \"How I Met Your Mother\", \"Gilmore Girls\", \"What I Like About You\", and \"Entourage\", as well as co-hosted the April 4, 2007 episode of \"Attack of the Show!', '\" and a segment of \"The Feed\" on May 23.', ' In September 2008, Specter began co-starring in the web series \"Long Distance Relationship\" on Crackle.']], ['Endurance running hypothesis', ['The endurance running 
hypothesis is the hypothesis that the evolution of certain human characteristics can be explained as adaptations to long distance running.', ' The hypothesis suggests that endurance running played an important role for early hominins in obtaining food.', ' Researchers have proposed that endurance running began as an adaptation for scavenging and later for persistence hunting.']], ['The Infestation Hypothesis', ['\"The Infestation Hypothesis\" is the second episode of the fifth season of \"The Big Bang Theory\" that first aired on CBS on September 22, 2011.', ' It is the 89th episode overall.', ' In it, Sheldon (Jim Parsons) becomes worried when Penny acquires a new chair, while Leonard (Johnny Galecki) tries to make his long distance relationship with Priya work.', ' The episode was watched by nearly 12 million viewers in the U.S. and received mixed reviews.']], ['Meredith Kessler', ['Meredith Brooke Kessler (born June 28, 1978) is an American professional triathlete from Columbus, Ohio who races in long distance, non-drafting triathlon events.', ' She took third place at the 2011 ITU Long Distance Triathlon World Championships and has won numerous Ironman and half-Ironman distance races as both an amateur and a professional.', \" She was named USA Triathlon's 2014 Non-Drafting Athlete of the Year.\"]], ['Johnny Galecki', ['John Mark Galecki (born April 30, 1975) is an American actor.', ' He is known for playing David Healy in the ABC sitcom \"Roseanne\" from 1992 to 1997 and Dr. Leonard Hofstadter in the CBS sitcom \"The Big Bang Theory\" since 2007.', ' Galecki also appeared in the films \"National Lampoon\\'s Christmas Vacation\" (1989), \"Prancer\" (1989), \"Suicide Kings\" (1997), \"I Know What You Did Last Summer\" (1997), \"Bookies\" (2003), and \"In Time\" (2011).']], ['Communications in Guam', ['Though Guam is a United States territory, some U.S. 
long distance plans and courier services list Guam as an international location.', \" As a result of Guam's being added to the North American Numbering Plan (NANP) in 1997, calls made to the U.S., Canada, or other participating countries from Guam (or to Guam from other NANP locations) only require the caller to dial a 1 followed by the area code.\", ' In this way, only domestic charges are incurred between the US and Guam on most carriers.', \" Before Guam's inclusion, calling the U.S. required dialing the international 011 first, thus resulting in higher long distance rates and less frequent calls to the U.S. by relatives in Guam.\", ' Prices of long distance calls to these destinations have dropped significantly to the point where now calling the U.S. from Guam or calling Guam from the U.S. costs the same.']], ['Permanent Roommates', ['Permanent Roommates is an Indian web series created by The Viral Fever(TVF) and Biswapati Sarkar.', ' This series revolves around a young couple,Tanya and Mikesh, who after being in a long distance relationship for 3 years, face the prospect of marriage.', ' Permanent Roommates has been renewed for a third season, which will premiere in 2018.']], ['Made in Chelsea (series 10)', ['The tenth series of Made in Chelsea, a British structured-reality television programme, began airing on 19 October 2015 on E4.', ' The official trailer for the new series was released on 29 September 2015 confirming the start date.', ' It concluded on 4 January 2016 following nine regular episodes, a Christmas special, a New Year special, and an End of Season party hosted by Rick Edwards.', ' This series was the first to include new cast members Emma Walsh, Sam Harney, Tallulah Rufus Isaacs.', ' Richard Dinan also returned to the series having last appeared during the fifth series, and Francis Boulle made a one-off return during the Christmas special.', ' This was also the final series to include original cast member Spencer Matthews, long-running cast 
member Oliver Proudlock, as well as Millie Wilkinson and Emily Weller, who both made their debuts during the ninth series.', \" The series focused heavily on Sam and Tiff's rocky relationship coming to an end when Tiff admits to cheating on him during the summer and rumours of Sam cheating surface, until the pair eventually reunite.\", \" It also includes Louise and Alik attempting to make their long distance relationship work with obstacles in their way, Binky and JP finally making their relationship official despite commitment issues from his part, and Spencer causing further trouble by hooking up with Ollie's latest love interest Emma.\"]], ['The Heart Machine', ['The Heart Machine is a 2014 romantic thriller film written and directed by Zachary Wigon based on his short film \"Someone Else\\'s Heart\".', \" The film centers on Cody's John Gallagher, Jr. and Virginia's Kate Lyn Sheil long distance relationship that becomes strained when evidence appears to contradict Virginia's background.\", ' The film was released in a limited release on October 24, 2014, by Filmbuff.']], ['Northwestern International University', ['Northwestern International University was one of the first colleges to offer self-directed online programs, which were based on review of prior-earned college credits, professional life-experiences, practical knowledge, research, portfolio work, and the passage of comprehensive examinations *Cite (Northwestern International University Registration Catalog).', ' N.I.U. 
was a member of the Long Distance Learning Council *Cite (Long Distance Learning Council Catalog).', ' Their admissions process consisted of the initial registration process, student selection, and the review of student work and experience.', ' Students had to show proof of passing content specific exams before being considered for school admission.', ' They were also required to pass comprehensive exams at the completion of their respective program.', ' Furthermore, students were expected to complete a Practicum Learning Portfolio Log.', ' The time-requirement for portfolio hours varied by subject matter.', ' Lastly, students had to successfully complete and present research, before N.I.U. would issue their degree *Cite(Northwestern International University Registration Catalog).']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▎ | 218/500 [01:41<07:06, 1.51s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:51:51.703\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5abbc4d255429931dba144fe', 'answer': 'Weldenia', 'question': 'Which genus of plant grows originally in Mexico and Guatemala, Phylica or Weldenia?', 'supporting_facts': [['Phylica', 0], ['Phylica', 2], ['Weldenia', 1]], 'context': [['Mendoncia velloziana', ['Mendoncia velloziana is a plant native to Atlantic Coast restingas vegetation which is an ecosystem of Atlantic Forest biome.', ' In addition, this plant grows either in Cerrado vegetation of Brazil.', ' This plant grows in following states of Brazil: Bahia, Ceará Minas Gerais Rio de Janeiro, São Paulo, Paraná and Santa Catarina, and it is usually visited by the hummingbirds.']], ['Agave ghiesbreghtii', ['Agave ghiesbreghtii is an evergreen plant belonging to the family Asparagaceae, subfamily Agavoideae.', ' The plant grows in clustering rosettes, up to 75\\xa0cm in diameter and 50\\xa0cm tall with wide leaves which are guttered on top.', ' In spring the plant produces dense greenish brown to purple flowers on the top half of the unbranched spike which measures between 2.5m - 5m tall.', ' The species is endemic in Guatemala and the State of Mexico in Mexico.']], ['Weldenia', ['Weldenia is a monotypic genus of flowering plant in the Commelinaceae family, first described in 1829.', ' It has one single species: Weldenia candida, which grows originally in Mexico and Guatemala.']], ['Pinguicula orchidioides', ['Pinguicula orchidioides is a 
perennial rosette-forming insectivorous herb native to Mexico and Guatemala.', ' A species of butterwort, it forms summer rosettes of flat, succulent leaves up to 5\\xa0centimeters (4\\xa0in) long, which are covered in mucilagenous (sticky) glands that attract, trap, and digest arthropod prey.', ' Nutrients derived from the prey are used to supplement the nutrient-poor substrate that the plant grows in.', ' Uniquely among \"Pinguicula\" species from the Americas, \"p. orchidioides\" produces gemma-like basal buds which elongate into stolons and serve as a means of asexual reproduction.', ' In the winter the plant forms a non-carnivorous rosette of small, fleshy leaves that conserves energy while food and moisture supplies are low.', ' Single purple flowers appear between July and September on upright stalks up to 22 centimeters long.']], ['Salvia divinorum', [\"Salvia divinorum (also known as sage of the diviners, ska maría pastora, seer's sage, yerba de la pastora and just salvia) is a psychoactive plant which can induce visions and other spiritual experiences. 
Its native habitat is in cloud forest in the isolated Sierra Mazateca of Oaxaca, Mexico, where it grows in shady and moist locations.\", ' The plant grows to over a meter high, has hollow square stems, large leaves, and occasional white flowers with violet calyxes.', ' Botanists have not determined whether \"Salvia divinorum\" is a cultigen or a hybrid; native plants reproduce vegetatively, rarely producing viable seed.']], ['Argemone albiflora', ['Argemone albiflora, the white prickly poppy, also known as the bluestem prickly poppy or the Texas prickly poppy, is a small erect plant with a decorative white flower with a yellow latex.', ' It is deeply rooted with yellow or red stamens.', ' The plant is known for the sharp prickles on its stem and leaves.', ' The sepals fall off as the flower of this plant grows bigger.', ' It grows in the arid regions of the southern Midwest along roadsides and disturbed pieces of land.', ' Native Americans have long revered this plant for its medicinal and other uses.']], ['Pinguicula moranensis', ['Pinguicula moranensis is a perennial rosette-forming insectivorous herb native to Mexico and Guatemala.', ' A species of butterwort, it forms summer rosettes of flat, succulent leaves up to 10\\xa0centimeters (4\\xa0in) long, which are covered in mucilaginous (sticky) glands that attract, trap, and digest arthropod prey.', ' Nutrients derived from the prey are used to supplement the nutrient-poor substrate that the plant grows in.', ' In the winter the plant forms a non-carnivorous rosette of small, fleshy leaves that conserves energy while food and moisture supplies are low.', ' Single pink, purple, or violet flowers appear twice a year on upright stalks up to 25 centimeters long.']], ['Phylica', ['Phylica is a genus of plants in the family Rhamnaceae.', ' It contains about 150 species, the majority of which are restricted to South Africa, where they form part of the \"fynbos \".', ' A few species occur in other parts of southern 
Africa, and on islands including Madagascar, the Mascarene Islands, Île Amsterdam, Saint Helena, Tristan da Cunha, and Gough Island.']], ['Salvia chamelaeagnea', ['Salvia chamelaeagnea is a species of flowering plant in genus \"Salvia\", known as sages.', ' It is endemic to South Africa, where it grows on the western coastline of the Cape of Good Hope.', ' It is a shrubby perennial herb up to 6 ft tall and 4 ft wide.', ' It bears 3/4 in light violet-blue flowers with pale lower lips and white throats.', ' The small, green leaves release a slight medicinal odor when brushed.', ' In the wild, the plant grows in sandy soil in streambeds, open fields, and roadsides.', ' It is cultivated for gardens.']], ['Chorizanthe watsonii', ['Chorizanthe watsonii is a species of flowering plant in the buckwheat family known by the common name fivetooth spineflower.', ' It is native to the western United States from Washington to the Mojave Desert.', ' It grows in many types of plant communities from desert scrub to woodland and sagebrush.', ' This small plant grows a woolly erect stem up to about 15 centimeters tall.', ' The inflorescence is a cluster of flowers surrounded by five hairy greenish bracts tipped with hooked awns.', ' The flower is about 2 millimeters wide and yellow in color.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:51:51.902\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7d109855429909bec7692f', 'answer': '1978', 'question': 'The director of Panic 5 Bravo was born in what year?', 'supporting_facts': [['Panic 5 Bravo', 0], ['Kuno Becker', 0]], 'context': [['Paul Bravo', ['Paul Bravo (born June 19, 1968 in Campbell, California) is a former American soccer midfielder and forward who played six seasons in Major League Soccer, two in the American Professional Soccer League and two in the USISL.', \" He also earned four caps, scoring one goal, with the United States men's national soccer team.\", ' After his retirement from playing, Bravo served for several years as an assistant coach in both Major League Soccer and the NCAA and was most recently Technical Director for the Colorado Rapids.']], ['Kuno Becker', ['Eduardo Kuno Becker Paz (born January 14, 1978) is a Mexican actor who has worked in telenovelas, Mexican cinema and U.S. 
cinema, but is best known for his portrayal of Ruben Berrizabal in \"Soñadoras\" and Santiago Muñez in the football movie \"Goal!', '\" and following sequels.']], ['Jake Sinclair (musician)', ['Jake Sinclair (born March 7, 1985) is an American record producer, audio engineer, mixing engineer, multi-instrumentalist, vocalist, and songwriter.', ' His production, engineering, songwriting, and mixing credits include Weezer, Fall Out Boy, Panic!', ' at the Disco, 5 Seconds of Summer, Pink, New Politics, Andrew McMahon in the Wilderness, Gin Wigmore, and Train.', ' Sinclair co-wrote and produced Panic!', ' at the Disco\\'s \"Death of a Bachelor\" album (which debuted at number one on the US Billboard 200) and produced Weezer\\'s 2016 \"Weezer (White Album)\".', ' Both were nominated for Best Rock Album at the 59th Annual Grammy Awards.', ' He co-wrote and produced \"Uma Thurman\" by Fall Out Boy, which debuted at number one on the U.S. iTunes Chart, reached number 22 on the Billboard Hot 100, and was certified 2X Platinum by the RIAA in December 2015.', ' Sinclair received a Grammy nomination for Album of the Year for his work as engineer and bassist on Taylor Swift\\'s \"Everything Has Changed\" alongside producer Butch Walker.', ' He co-wrote and produced the debut single, \"She Looks So Perfect\" by 5 Seconds of Summer that peaked at number one in over five countries and won \"Song of the Year\" at the 2014 ARIA Awards.', ' Sinclair is the former bassist of the indie/pop rock band The Films and the lead singer and producer of the indie pop duo Alohaha.']], ['C. E. Gatchalian', ['C.E. 
\"Chris\" Gatchalian (born June 5, 1974) is a Canadian playwright, born in Vancouver, British Columbia to Filipino parents, he holds an MFA in Creative Writing and Theatre from the University of British Columbia.', ' His play \"Motifs & Repetitions\" aired on Bravo!', ' (Canada) in 1997 and on the Knowledge in 1998.', ' His other produced plays include \"Claire\", \"Crossing\", \"Broken\" and \"People Like Vince\", a play for young audiences about mental health.', ' His latest play, \"Falling in Time\", had its world premiere in Vancouver in November 2011 and was published by Scirocco Drama in 2012.', \" In 2013 he won the Dayne Ogilvie Prize, a prize presented by the Writers' Trust of Canada to an openly LGBT writer.\"]], ['Rumen Petkov', ['Rumen Petkov (Bulgarian: Румен Петков ) (born 26 January 1948) is a Bulgarian animator and comic creator.', ' His influence spawned a new generation of young Bulgarian comic book artists as Vladimir Nedialkov, Koko Sarkisian, Ivan Kirjakov and others.', ' He was one of the main artists of the comics magazine DUGA (Rainbow), which was the most popular comics for several generations of Bulgarian children.', ' His most popular cartoon is \"The Adventures of Choko the Stork and Boko the Frog\" which was popular in Bulgaria during the 1970s and 1980s.', ' Other famous animated films he directed are \"Friends of Gosho the Elephant\", \"Treasure Planet\", etc.', \" He has won the Grand Prize at the Ottawa Animation Festival and the Palme d'Or at the Cannes Film Festival.\", ' Recently Rumen Petkov has worked as a writer, storyboard artist, animation director and director on some episodes of \"Johnny Bravo\", \"Dexter\\'s Laboratory\", \"Cow and Chicken\", \"I Am Weasel\", \"The New Woody Woodpecker Show\" and other series.', ' He has said about animation: \"Animation will never die because it\\'s like music, because it\\'s like running with the wind, because it\\'s funny.\"']], ['Stéphane Aubier', ['Stéphane Aubier (born October 8, 
1964) is a Belgian film director and screenwriter.', ' In 2009, he wrote and directed the animated film \"A Town Called Panic\" along with Vincent Patar.', ' It premiered at the 2009 Cannes Film Festival and was the first stop-motion film to be screened at the festival.', ' In 2013, he co-directed with Patar and Benjamin Renner the film \"Ernest & Celestine\", which received widespread critical acclaim.', ' The film received three Magritte Awards, including Best Film and Best Director for Aubier and Patar.', ' It also received a nomination at the 86th Academy Awards, in the category of Best Animated Feature.']], ['Panic 5 Bravo', ['Panic 5 Bravo is an action-thriller film directed by Kuno Becker about American paramedics that become trapped on the Mexican side of the border and terrorized by a violent psychopath.', ' It was released in the U.S. by Pantelion Films.']], ['The Mins', ['The Mins are a Georgian Alternative / New Progressive Rock band established in 2011 by Zviad Mgebrishvili.', ' The band played its first live gig in 2011 on Altervision Newcomers.', ' After that the band started to work hard on their repertoire.', ' They mostly performs original songs and only rarely covers.', ' The main songwriter in the band is Zviad Mgebrishili.', ' Some songs are written by Shota Gvinepadze (keyboard) as well.', ' The band has four music videos on the following songs: \"Blind World\", \"O.W.L.\", \"My Lover is a Killer\" and \"I Don\\'t Give a Foot\".', ' Zviad Mgebrishvili was participating in the TV show \"Akhali Khma\" [\"The Voice of Georgia\"] in 2013 very successfully (5 stages).', ' The band has performed on many festivals and concerts.', ' The band had their first big solo concert in Tbilisi Eventhall 26 May 2014, where they had presentation of their first EP, named \"Blind World\" (released same year, included 5 songs).', ' The band has an honor to be warm up of \"Faithless\" (Tbilisi Summer Set 2014) and \"Archive\" (Tbilisi Open Air/Altervision 2015, 
where apart from Archive - Placebo, Beth Hart and Black Label Society were the headliners).', ' Zaza Mgebrishvili has left the band in 2015 and new bass player and backing vocal of the band is Nika Abesadze who used to play with Zviad Mgebrishvili early years in the university rock band \"Sunny Universe\".', ' The band is now recording their first album \"First Minute\" in the Bravo Records sound recording studio that will be released in the Winter of 2015.']], ['Vincent Patar', ['Vincent Patar (born 2 September 1965) is a Belgian film director and screenwriter.', ' In 2009 he wrote and directed the animated film \"A Town Called Panic\" along with Stéphane Aubier.', ' It premiered at the 2009 Cannes Film Festival and was the first stop-motion film to be screened at the festival.', ' In 2013 he co-directed with Aubier and Benjamin Renner the film \"Ernest & Celestine\", which received widespread critical acclaim.', ' The film received three Magritte Awards, including Best Film and Best Director for Patar and Aubier.', ' It also received a nomination at the 86th Academy Awards, in the category of Best Animated Feature, to be held on 2 March 2014.']], ['Ann Lewis (musician)', ['Ann Lewis (アン・ルイス , An Ruisu , born 5 June 1956 in Takarazuka, Hyōgo, Japan) is a Japanese singer, popular in Japan in the 1970s and 80s.', ' She was born to an American father and a Japanese mother.', ' She has one brother and a son, Myuji, who is also a singer in Japan.', ' She was married to Masahiro Kuwana, another Japanese singer, from 1980 to 1984.', ' Her many hits include the popular song \"Roppongi Shinju\", \"Good Bye My Love\" and many others which have been covered by other Asian artists.', ' She semi-retired from show-business in the 1990s, suffering from chronic panic attacks, and settled down in Los Angeles.', ' She released a few self-covers albums in the 2000s.', ' She has been active as a Creative Director, Consultant and Designer.', ' Works include Interior designs, (private 
homes to business offices, restaurants and shops), releasing a line of original jewelry, Creating original Animation, Logos and other projects.', ' She has also been involved as the President, COO and marketing consultant for several software companies in the USA.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 10 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 220/500 [01:41<04:53, 1.05s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:51:52.024\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ac4e9ab5542996feb3fe974', 'answer': 'Roger Jason Stone Jr.', 'question': 'which American political consultant was a former Trump campaign ', 'supporting_facts': [['Jack Posobiec', 3], ['Roger Stone', 0]], 'context': [['Mary Matalin', ['Mary Joe Matalin (born August 19, 1953) is an American political consultant well known for her work with the Republican Party.', ' She has served under President Ronald Reagan, was campaign director for George H. W. Bush, was an assistant to President George W. 
Bush, and counselor to Vice President Dick Cheney until 2003.', ' Matalin has been chief editor of Threshold Editions, a conservative publishing imprint at Simon & Schuster, since March 2005.', ' She is married to Democratic political consultant James Carville.', ' She appears in the award-winning documentary film \"\" and also played herself, opposite her husband, James Carville, John Slattery, and Mary McCormack in the short lived HBO series \"K Street\".']], ['Rick Davis (politics)', ['Richard H. \"Rick\" Davis, Jr. (born 1957) is an American political consultant.', ' He currently serves as a Partner and Chief Operating Officer of Pegasus Sustainable Century Merchant Bank, a private equity firm specializing in sustainable development projects.', ' He is a managing partner of the business development and public affairs consulting firm Davis-Manafort, located in Alexandria, Virginia.', \" He is best known for being the National Campaign Manager of John McCain's 2008 Presidential campaign (from April 25, 2007 to November 4, 2008).\", ' In that capacity, he oversaw the development and implementation of all campaign strategy and policy development.', ' Davis also served McCain as National Campaign Manager for his 2000 Republican Presidential Primary campaign ( April 6, 1999 to March 9, 2000).']], ['George Birnbaum', ['George E. 
Birnbaum is an American international political consultant.', ' He was raised in Atlanta, Georgia, and has worked on dozens of United States Congressional and Senatorial races.', ' In 1998 he moved to Israel to serve as a consultant to Prime Minister Benjamin Netanyahu, became his chief of staff, and afterwards formed a partnership with political consultant Arthur Finkelstein.', ' His work includes polling, strategy, paid media and grassroots coalition building, developing and implementing campaign strategies.', ' During his career, George Birnbaum has worked on campaigns on 5 continents and has helped elect over 15 Presidents and Prime Ministers worldwide.']], ['Roger Stone', ['Roger Jason Stone Jr. (born August 27, 1952) is an American political consultant, lobbyist, and strategist, noted for his use of opposition research usually for candidates of the Republican Party.']], ['Basket of deplorables', ['\"Basket of deplorables\" is a phrase from a 2016 presidential election campaign speech delivered by Democratic nominee Hillary Clinton on September 9, 2016, at a campaign fundraising event, which Clinton used to describe a faction of supporters of her general election opponent, Republican nominee Donald Trump.', ' Clinton later said that she \"regrets saying half [of Trump\\'s supporters]\", and the Trump campaign repeatedly used the phrase against her during and after the 2016 presidential election.', ' Many Trump supporters adopted the \"Deplorable\" moniker for themselves.', \" After Clinton's loss, some journalists and political analysts questioned whether or not the speech played a role in the election's outcome.\"]], ['Fred Karger', ['Fred S. 
Karger (born January 31, 1950) is an American political consultant, gay rights activist and watchdog, former actor, and politician.', ' His unsuccessful candidacy for the Republican nomination for the 2012 US Presidential election made him the first openly gay presidential candidate in a major political party in American history.', ' Although he has not held elected or public office, Karger has worked on nine presidential campaigns and served as a senior consultant to the campaigns of Presidents Ronald Reagan, George H. W. Bush and Gerald Ford.', ' Karger was a partner at the Dolphin Group, a California campaign consulting firm.', \" He retired after 27 years and has since worked as an activist on gay rights causes, from protecting the gay bar The Boom to using his organization Californians Against Hate to investigate The Church of Jesus Christ of Latter-day Saints (LDS Church) and the National Organization for Marriage's campaigns to repeal the state's same-sex marriage law.\"]], ['Jack Posobiec', ['Jack Posobiec ( ) is an American alt-right pro-Donald Trump Internet activist and conspiracy theorist, known primarily for his controversial comments on Twitter.', ' During the 2016 election, he was a special projects director of Citizens for Trump, a pro-Trump organization.', ' For two months in 2017, he was a correspondent for \"The Rebel\", a far-right Canada-based website.', ' He was granted press access to the White House in April 2017, and his tweets have been promoted by former Trump campaign manager Roger Stone.']], ['Dick Morris', ['Richard Samuel \"Dick\" Morris (born November 28, 1946) is an American political author and commentator who previously worked as a pollster, political campaign consultant, and general political consultant.']], ['Joseph Napolitan', ['Joseph Napolitan (March 6, 1929 – December 2, 2013) was an American political consultant, who worked as a general consultant on over 100 political campaigns in the United States, and many others 
throughout the world.', ' Napolitan served on the 1960 Kennedy for President campaign, was Director of Media for the 1968 Hubert Humphrey campaign, and received the French Legion of Honour in 2005.', ' He died on December 2, 2013 at the age of 84.']], ['Roger Ailes', ['Roger Eugene Ailes (May 15, 1940\\xa0– May 18, 2017) was an American television executive and media consultant.', ' He was the founder and one-time Chairman and CEO of Fox News and the Fox Television Stations Group, from which he resigned in July 2016 following allegations that he sexually harassed female colleagues.', \" Ailes was a media consultant for Republican presidents Richard Nixon, Ronald Reagan, and George H. W. Bush, and for Rudy Giuliani's first mayoral campaign.\", ' In 2016, after he left Fox News, he became an adviser to the Donald Trump campaign, where he assisted with debate preparation.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 10 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 221/500 [01:41<04:02, 1.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:51:52.350\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8ee6915542990e94052bad', 'answer': 'third', 'question': 'What season was the character introduced that becomes the main antagonist in the following season, from the animated television series created by Bryan Konietzko and Michael Dante DiMartino as a sequel to \"\", which aired from 2005 to 2008? ', 'supporting_facts': [['Kuvira', 0], ['Kuvira', 1], ['The Legend of Korra', 0], ['The Legend of Korra', 1]], 'context': [['Bolin (The Legend of Korra)', ['Bolin (愽林 , Bó Lín ) is a major fictional character in Nickelodeon\\'s animated television series \"The Legend of Korra\", which aired from 2012 to 2014.', ' The character and the series, a sequel to \"\", were created by Michael Dante DiMartino and Bryan Konietzko.', ' He is voiced by P. J. 
Byrne.', ' Bolin is able to manipulate the classical element of earth, which is known as earthbending.', ' It is revealed in the third season that he is also able to create and control lava, which is a very rare sub-ability called lavabending.']], ['Zaheer', ['Zaheer is a major recurring character in Nickelodeon\\'s animated television series \"The Legend of Korra\" (a sequel to \"\").', ' While he serves as the main antagonist of \"Book Three: Change\", his actions have lingering effects on Avatar Korra and the series\\' plot in the following book.', ' The character was created by Michael Dante DiMartino and Bryan Konietzko and is voiced by Henry Rollins.']], ['List of Avatar: The Last Airbender episodes', ['\"\" is a 61-episode American animated television series created by Michael Dante DiMartino and Bryan Konietzko.', ' It first aired on February 21, 2005, on Nickelodeon with a one-hour series premiere and concluded its run with a two-hour TV movie on July 19, 2008.', ' The \"Avatar: The Last Airbender\" franchise refers to each season as a \"Book\", in which each episode is referred to as a \"chapter\".', ' Each \"Book\" takes its name from one of the elements that Aang, the protagonist, must master: Water, Earth, and Fire.', \" The show's first two seasons each consisted of 20 episodes, while the third season had 21.\", ' In addition to the three seasons, there were two recap episodes and three \"shorts\".', ' The first recap summarized the first eighteen episodes while the second summarized season two.', ' The first self-parody was released via an online flash game.', ' The second and third were released with the Complete Second Season Box Set DVD.', ' The entire series has been released on DVD in Region One, Region Two and Region Four.']], ['Iroh', ['General Iroh (艾洛 , Aì Luò ) is a fictional character in Nickelodeon\\'s animated television series \"\".', ' Created by Michael Dante DiMartino and Bryan Konietzko, the character was voiced by Mako Iwamatsu in 
season one and season two and, due to Mako\\'s death, by Greg Baldwin, in season three and the sequel series \"The Legend of Korra\".']], ['Avatar: The Last Airbender (season 2)', ['Season Two (Book Two: Earth) of \"\", an American animated television series on Nickelodeon, first aired its 20\\xa0episodes from March 17, 2006 to December 1, 2006.', ' The season was created and produced by Michael Dante DiMartino and Bryan Konietzko, and starred Zach Tyler Eisen, Mae Whitman, Jack DeSena, Jessie Flower, Dante Basco, Dee Bradley Baker, Mako Iwamatsu and Grey DeLisle as the main character voices.']], ['Avatar: The Last Airbender (season 1)', ['Season one (Book One: Water) of \"\", an American animated television series produced by Nickelodeon Studios, aired 20 episodes from February 21, 2005 to December 2, 2005.', ' The series was created by Michael Dante DiMartino and Bryan Konietzko, and starred Zach Tyler Eisen, Mae Whitman, Jack DeSena, Dante Basco, Dee Bradley Baker, Mako Iwamatsu and Jason Isaacs as the main character voices.']], ['The Legend of Korra', ['The Legend of Korra is an American animated television series that aired on the Nickelodeon television network from 2012 to 2014.', ' It was created by Bryan Konietzko and Michael Dante DiMartino as a sequel to \"\", which aired from 2005 to 2008.', ' Animated in a style strongly influenced by anime, the series is set in a fictional universe in which some people can manipulate, or \"bend\", the elements of water, earth, fire, or air.', ' Only one person, the \"Avatar\", can bend all four elements, and is responsible for maintaining balance in the world.', ' The series follows Avatar Korra, the reincarnation of Aang from the previous series, as she faces political and spiritual unrest in a modernizing world.']], ['Kuvira', ['General Kuvira (古維拉 , Gǔ Wéi Lā ) is a fictional character and a character in \"The Legend of Korra\", created by Michael Dante DiMartino and Bryan Konietzko.', ' Introduced in the third 
season of the series, she becomes the main antagonist of the fourth season.', ' Kuvira was created with similar characteristics to the portrayal of protagonist Korra in prior seasons to highlight the changes she had made over the series.', \" Kuvira's character has been mostly met with positive reception.\", ' Critics note her motives as being understandable, while her actions are given political analogues.']], ['List of The Legend of Korra episodes', ['\"The Legend of Korra\" is an American animated television series created by Michael Dante DiMartino and Bryan Konietzko.', ' A sequel to \"\", the series first aired on Nickelodeon in 2012.', ' Like its predecessor, the series is set in a fictional world inspired by Asian and Inuit cultures, and inhabited by people who can manipulate the elements of water, earth, fire or air through an ability called \"bending.\"', ' One person, the \"Avatar,\" has the ability to bend all four elements.', \" Reincarnating in turn among the world's four nations, the Avatar is responsible for maintaining peace, harmony, and balance in the world.\", ' Korra, the series\\' protagonist, is the next incarnation of the Avatar after Aang of \"Avatar: The Last Airbender\".', ' Four seasons with a total of 52 episodes have aired.']], ['Avatar: The Last Airbender (season 3)', ['Season Three (Book Three: Fire) of \"\", an American animated television series on Nickelodeon, first aired its 21 episodes from September 21, 2007 to July 19, 2008.', ' The season was created by Michael Dante DiMartino and Bryan Konietzko, and starred Zach Tyler Eisen, Mae Whitman, Jack DeSena, Jessie Flower, Dante Basco, Dee Bradley Baker, Greg Baldwin, Grey DeLisle and Mark Hamill as the main character voices.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current 
AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 44%|████▍ | 222/500 [01:41<03:28, 1.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:51:52.467\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a82edae55429966c78a6a9f', 'answer': '1986', 'question': 'Swiss music duo Double released their best known single \"The Captain of Her Heart\" in what year?', 'supporting_facts': [['Blue (Double album)', 1], ['Double (band)', 0]], 'context': [['Feargal Sharkey (album)', ['Feargal Sharkey is the first solo album of former Undertones singer Feargal Sharkey.', ' The album was released in 1985, peaking at #12 in the UK and contains Sharkey\\'s best known single \"A Good Heart\" his only No. 1.']], ['Devils Ball', ['\"Devils Ball\" is a song by Swiss duo Double, released as the lead single from their second studio album \"Dou3le\".', ' The single was released in 1987, and featured a guest appearance from Herb Alpert, who played trumpet on the track.']], ['The Captain of Her Heart', ['\"The Captain of Her Heart\" is a single by the Swiss duo Double in 1985.', ' Taken from their 1985 album \"Blue\", the song is a ballad about a girl who stops waiting for her absent lover to return.', ' The song was an international success, reaching No. 8 in the UK Singles Chart and No. 
16 on the \"Billboard\" Hot 100.', ' The song also made Double the first Swiss act to hit the Top 40 in the Billboard Hot 100.']], ['Double (band)', ['Double (pronounced \"doo-blay\") was a Swiss music duo best known for their hit single \"The Captain of Her Heart\".']], ['Blue (Double album)', ['Blue is the first full-length album from Swiss band Double.', ' In addition to containing updated versions of two of the band\\'s earlier singles (\"Woman of the World\" and \"Rangoon Moon\"), the album included the international smash hit, \"The Captain of Her Heart\", a plaintive, atmospheric, piano-led ballad which was an immediate success throughout Europe upon its 1986 single release.', ' Follow-up singles \"Your Prayer Takes Me Off\" and \"Tomorrow\" were less successful.']], ['Kurt Maloo', ['Kurt Maloo (born Kurt Meier, April 16, 1953 in Zurich, Switzerland) is a Swiss singer-songwriter, composer, and record producer.', ' He first achieved international success in 1986, as the singer and front man of the duo Double with the hit single, \"The Captain of Her Heart\".']], ['Parno Graszt', ['Parno Graszt is a Roma (i.e. 
\"Gypsy\") music ensemble from Paszab, Hungary founded in 1987.', ' \"Parno Grast\" means \"white horse\" in the Romany language, with \"graszt\" using the Hungarian orthography \\'sz\\' for \\'s\\'.', ' In the Roma culture white is symbol of purity and horse is a symbol of freedom.', ' Their debut album \"Hit the piano\" reached Number 7 on the World Music Chart Europe in October 2002.', ' Hungarian Television and the BBC produced in 2004 a music documentary about Parno Graszt.', ' After their second album, \"Járom az utam\" (2004), Parno Graszt was voted in the top 10 for \"best artist of year\", 2005, by the Swiss music magazine \"Vibrations\".', ' In 2016, they competed in A Dal, the national final selection for Hungary in the Eurovision Song Contest with the song \"Már nem szédülök\", and reached the final.']], ['Move It Like This (song)', ['\"Move It Like This\" is a song recorded by the Bahamian pop group Baha Men.', ' It was released in February 2002 as the second single from the album, \"Move It Like This\".', ' The song reached number 13 on the New Zealand RIANZ list, number 13 on the Canadian Singles Chart and number 65 on the Swiss Music Charts.', ' The song was also featured on the 2002 compilation album \"Now That\\'s What I Call Music!', ' 10\".']], ['Stick Figure Neighbourhood', ['Stick Figure Neighbourhood was the first album by the Burlington band Spoons.', ' Released in 1981, it received some airplay on college stations, particularly the songs \"Conventional Beliefs\" and \"Red Light\".', ' It was their next album, \"Arias & Symphonies\", and its best known single, \"Nova Heart\", that were to launch them to fame.']], ['2005 in Swiss music', ['2005 was a big year for Swiss music, with the charts becoming steadier yet less predictable than they had been in previous years.', ' The year saw many chart debuts from both Swiss and international acts, and saw two novelty songs share a combined total of over ten weeks at the singles chart number one 
spot.', ' Internationally, the Swiss also saw Vanilla Ninja take the country to their best Eurovision Song Contest position in twelve years.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 45%|████▍ | 224/500 [01:42<02:11, 2.10it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:02.172\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 45%|████▌ | 225/500 [01:51<13:17, 2.90s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:02.389\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 45%|████▌ | 226/500 [01:52<09:51, 2.16s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:02.712\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 45%|████▌ | 227/500 [01:52<07:28, 1.64s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 228/500 [01:55<09:20, 2.06s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:10.782\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 229/500 [02:00<13:07, 2.91s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 46%|████▌ | 230/500 [02:02<11:37, 2.58s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 47%|████▋ | 234/500 [02:02<04:14, 1.05it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.18181818181818182, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 48%|████▊ | 239/500 [02:03<01:38, 2.65it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 48%|████▊ | 241/500 [02:03<01:16, 3.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 49%|████▊ | 243/500 [02:03<01:08, 3.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.15384615384615385, 'em': 0.0, 'acc': 
1.0}\n", "metrics {'f1': 0.1, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 49%|████▉ | 245/500 [02:04<00:59, 4.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 49%|████▉ | 246/500 [02:04<00:54, 4.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 49%|████▉ | 247/500 [02:04<01:13, 3.43it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|████▉ | 248/500 [02:05<01:08, 3.68it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 50%|████▉ | 249/500 [02:05<01:03, 3.94it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 51%|█████ | 254/500 [02:05<00:28, 8.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3636363636363636, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 52%|█████▏ | 259/500 [02:05<00:18, 12.91it/s]" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "metrics {'f1': 0.19999999999999998, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6153846153846153, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.11764705882352941, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 52%|█████▏ | 261/500 [02:08<01:43, 2.30it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 263/500 [02:08<01:20, 2.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 53%|█████▎ | 265/500 [02:09<01:11, 3.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 54%|█████▍ | 271/500 [02:09<00:38, 5.90it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 55%|█████▍ | 274/500 [02:10<00:52, 4.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 
1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 55%|█████▌ | 277/500 [02:11<00:38, 5.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 56%|█████▌ | 279/500 [02:11<00:34, 6.41it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 57%|█████▋ | 283/500 [02:11<00:29, 7.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 57%|█████▋ | 285/500 [02:12<00:26, 8.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.2222222222222222, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 58%|█████▊ | 288/500 [02:12<00:24, 8.77it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5625, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 58%|█████▊ | 290/500 [02:13<00:44, 4.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": 
"stream", "text": [ "Evaluating workflow: 58%|█████▊ | 292/500 [02:13<00:42, 4.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 59%|█████▉ | 294/500 [02:14<00:46, 4.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2666666666666667, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 60%|█████▉ | 299/500 [02:14<00:21, 9.26it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.625, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 61%|██████ | 303/500 [02:15<00:22, 8.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 61%|██████▏ | 307/500 [02:15<00:29, 6.48it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 309/500 [02:26<05:38, 1.77s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:52:41.250\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7322a25542991f9a20c634', 'answer': 'The Metropolitan Life Insurance Company Tower', 'question': 'Was the Metropolitan Life Insurance Company Tower [Met Life Tower] or the 15 Hudson Yards building designed by the firm of Napoleon LeBrun & Sons?', 'supporting_facts': [['Metropolitan Life Insurance Company Tower', 0], ['Metropolitan Life Insurance Company Tower', 1], ['15 Hudson Yards', 1]], 'context': [['Supreme Life Building', ['The Supreme Life Building is a historic insurance building located at 3501 S. Dr. Martin Luther King Drive in the Douglas community area of Chicago, Illinois.', ' Built in 1921, the building served as the headquarters of the Supreme Life Insurance Company, which was founded two years earlier.', ' The company, originally known as the Liberty Life Insurance Company, was the first African-American owned insurance company in the northern United States.', \" Since white-owned insurance firms regularly denied black customers life insurance when the firm was founded, the firm played an important role in providing life insurance to Chicago's African-American community.\", \" The company ultimately became the largest African-American owned business in the northern states and became a symbol of the predominantly black Bronzeville neighborhood's economic success from the 1920s to the 1950s.\"]], ['Napoleon LeBrun', [\"Napoleon Eugene Charles Henry LeBrun (January 2, 1821 – July 9, 1901) was an American architect known for several notable Philadelphia churches, in particular St. 
Augustine's Church on Fourth Street and the Cathedral-Basilica of Sts.\", ' Peter and Paul on Logan Square.', ' He also designed the Academy of Music at Broad and Locust Streets.', ' LeBrun later moved to New York City, where he established the firm Napoleon LeBrun & Sons, which designed numerous notable buildings.']], ['Metropolitan Life Insurance Company Tower', ['The Metropolitan Life Insurance Company Tower, colloquially known as the Met Life Tower, is a landmark skyscraper located on Madison Avenue near the intersection with East 23rd Street, across from Madison Square Park in Manhattan, New York City.', ' Designed by the architectural firm of Napoleon LeBrun & Sons and built by the Hedden Construction Company, the tower is modeled after the Campanile in Venice, Italy.', ' The hotel located in the clock tower portion of the building has the address 5 Madison Avenue, while the office building covering the rest of the block, occupied primarily by Credit Suisse, is referred to as 1 Madison Avenue.']], ['Hedden Construction Company', ['Some of the finest buildings in New Jersey, New York City, and other large eastern cities were built by the Hedden Construction Company, one of the largest construction companies operating in Newark in the very early 1900s.', ' Among the most notable is the Metropolitan Life Insurance Company Tower located at One Madison Avenue in New York, NY.', \" The tower was the world's tallest building from 1909 to 1913 and home to the Hedden Construction Company's main offices located on the 36th and 37th floors.\", ' During this prosperous period over $40,000,000 in construction contracts and payments were collected by the firm.']], ['15 Hudson Yards', [\"15 Hudson Yards is a residential building currently under construction on Manhattan's West Side.\", \" Located in Chelsea near Hell's Kitchen Penn Station area, the building is a part of the Hudson Yards project, a plan to redevelop the Metropolitan Transportation Authority's West Side 
Yards.\", ' The tower started construction on December 4, 2014.']], ['Flatiron Building', ['The Flatiron Building, originally the Fuller Building, is a triangular 22-story steel-framed landmarked building located at 175 Fifth Avenue in the borough of Manhattan, New York City, and is considered to be a groundbreaking skyscraper.', ' Upon completion in 1902, it was one of the tallest buildings in the city at 20 floors high and one of only two skyscrapers north of 14th Street – the other being the Metropolitan Life Insurance Company Tower, one block east.', \" The building sits on a triangular block formed by Fifth Avenue, Broadway, and East 22nd Street, with 23rd Street grazing the triangle's northern (uptown) peak.\", ' As with numerous other wedge-shaped buildings, the name \"Flatiron\" derives from its resemblance to a cast-iron clothes iron.']], ['Protective Life', ['Protective Life Corporation is a financial service holding company in Birmingham, Alabama.', ' The company’s primary subsidiary, Protective Life Insurance Company, was established in 1907 and now markets its products and services in all 50 states.', ' As of December 31, 2016, the corporation had more than 2,700 employees, annual revenues of $4.48 billion and assets of $75 billion.', \" In addition to Protective Life Insurance Company, Protective Life Corporation's subsidiaries include West Coast Life Insurance Company, MONY Life Insurance Company, Protective Life And Annuity Insurance Company, ProEquities Inc./Protective Securities, and Lyndon Property Insurance Company.\"]], ['Physicians Mutual', ['Physicians Mutual is a privately held insurance company headquartered in Omaha, Nebraska, United States, that consists of Physicians Mutual Insurance Company and Physicians Life Insurance Company.', ' Founded as Physicians Mutual Insurance Company in 1902 by Edwin E. 
Elliott, Physicians Mutual began by selling health insurance to medical professionals.', ' Policies were offered to the general public starting in 1962, and by 1970 the company expanded into life insurance when it founded Physicians Life Insurance Company.', ' Today the company offers a variety of insurance products, annuities, Medicare, Medigap, Medicare Supplement, Term Life Insurance, Whole Life Insurance, Cancer and funeral pre-planning services.', ' It holds over US$3 billion in assets and employs over one thousand people.', ' Robert A. Reed is chief executive officer and president.']], ['Lyceum Theatre (Park Avenue South)', ['The Lyceum Theatre was a theatre in New York City located on Fourth Avenue, now Park Avenue South, between 23rd and 24th Streets in Manhattan.', ' It was built in 1885 and operated until 1902, when it was torn down to make way for the Metropolitan Life Insurance Company Tower.', ' It was replaced by a new Lyceum Theatre on 45th Street.', ' For most of its existence, the theatre was home to Daniel Frohman’s Lyceum Theatre Stock Company, which presented many important plays and actors of the day.']], ['Metropolitan Life North Building', ['The Metropolitan Life North Building, now known as Eleven Madison, is a 30-story art deco skyscraper on Madison Square Park in Manhattan, New York City, at 11-25 Madison Avenue.', ' The building is bordered by East 24th Street, Madison Avenue, East 25th Street and Park Avenue South, and is connected by an elevated walkway to the Met Life Tower just south of it.', \" The North Building was built on the site of Richard Upjohn's original Madison Square Presbyterian Church.\", ' The second church, designed by Stanford White of McKim, Mead and White was built in 1906, across 24th street on land conveyed by Metropolitan Life.', ' As part of the Metropolitan Life Home Office Complex, the North Building was added to the National Register of Historic Places on January 19, 1996.']]], 'type': 'comparison', 'level': 
'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 310/500 [02:30<06:49, 2.16s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:41.264\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a72e28f5542992359bc31ba', 'answer': 'outlined by Joel Greenblatt', 'question': 'Which technique did the director at Pzena Investment Management outline?', 'supporting_facts': [['Magic formula investing', 0], ['Joel Greenblatt', 3]], 'context': [['Joel Greenblatt', ['Joel Greenblatt (born December 13, 1957) is an American academic, hedge fund manager, investor, and writer.', ' He is a value investor, and adjunct professor at the Columbia University Graduate School of Business.', ' He is the former chairman of the board of Alliant Techsystems and founder of the New York Securities Auction Corporation.', ' He is also a director at Pzena Investment Management, a high-end value firm.']], ['Orbis Investment Management', ['Orbis Investment Management is an investment management firm headquartered in Bermuda, with offices in London, Vancouver, Sydney, San Francisco, Hong Kong, Tokyo and Luxembourg.', ' The company has a close relationship with Allan Gray Investment Management in South Africa and Allan Gray Australia.', ' Orbis 
manages approximately $25\\xa0billion on behalf of both institutional and individual investors.', ' Orbis Access, its direct-to-consumer platform, was launched in the UK in January 2015.']], ['Richard Pzena', ['Richard \"Rich\" Pzena (born January 8, 1959) is an American investment manager.', ' He is the founder and chief investment officer of Pzena Investment Management, a New York-based deep value investment firm with $26.4 billion in assets under management.']], ['Journal of Investment Management', ['The Journal of Investment Management (JOIM) is a quarterly refereed journal which seeks to be a nexus of theory and practice of investment management.', ' \"The Journal Of Investment Management\" offers in-depth research with practical significance utilising concepts from the economics and accounting disciplines.', ' The editor is Gifford H. Fong, founder of Gifford Fong Associates, a boutique bond and equity analysis firm.']], ['Separately managed account', ['A separately managed account (SMA) is a term within the investment management industry encompassing several different types of investment accounts.', ' For example, an SMA often is used to refer to an individual managed investment account often offered by a brokerage firm through one of their brokers or financial consultants and managed by independent investment management firms (often called money managers for short) and have varying fee structures.', ' These particular types of SMAs may be called \"wrap fee\" or \"dual contract\" accounts, depending on their structure.', ' There is no official designation for the SMA, but there are common characteristics that are represented in many types of SMA programs.', \" These characteristics include an open structure or flexible investment security choices; multiple money managers; and a customized investment portfolio formulated for a client's specific investment objectives or desired restrictions.\"]], ['Magic formula investing', ['Magic formula investing is a term 
referring to an investment technique outlined by Joel Greenblatt that uses the principles of value investing.']], ['Royal London Asset Management', ['Royal London Asset Management (RLAM) is a UK-based investment management company with assets under management of more than £101 billion.', ' Headquartered in London, United Kingdom, it has over 2,900 employees working across seven sites in UK and Ireland(as at 30 September 2016).', ' RLAM offers investment management – mutual funds, active and passive portfolio management as well asset allocation for a wide range of clients.', ' RLAM’s clients include, but are not limited to; listed companies, pension schemes, local authorities, educational establishments, charities, wealth managers, financial advisers and multi-managers.', ' RLAM invests across all major asset classes, including the UK and overseas equities, government bonds, investment grade and high yield corporate bonds, property and cash.', \" RLAM is a wholly owned, autonomous subsidiary of the Royal London Group, the UK's largest mutual insurance company.\"]], ['Cowen Group', ['Cowen Inc. 
is a diversified financial services firm that provides alternative investment management, investment banking, research, and sales and trading services through its two business segments: Cowen Investment Management (formerly Ramius LLC), a global alternative investment management business, and Cowen and Company, LLC, a broker-dealer business.', ' Founded in 1918 by Harry Cowen and Arthur Cowen, Jr., the Firm is headquartered in New York City and has offices located worldwide.']], ['Investment control', ['Investment control or investment controlling is a monitoring function within the asset management, portfolio management or investment management.', ' It is concerned with independently supervising and monitoring the quality of asset management accounts with the aim of ensuring performance and quality in order to provide the required benefit for the asset management client.', ' Dependent on setup, investment controlling not only encompasses controlling activities but also can include areas from compliance to performance review.', ' Investment controlling aspects can also be taken into consideration by asset management clients or investment advisers/consultants and consequently it is likely that these stakeholders also run certain investment controlling activities.']], ['Barclays Wealth', ['Barclays Wealth and Investment Management is a wealth manager providing private banking, investment management, brokerage and fiduciary services to private clients and financial intermediaries all over the world.', ' Barclays provides Wealth and Investment Management across 20 offices to clients in 50 countries and has client assets of £202.8\\xa0billion (as of 30 June 2013).']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:52:41.379\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 62%|██████▏ | 312/500 [02:31<04:33, 1.45s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:41.414\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5addc7e35542997545bbbdbe', 'answer': 'American Samoa, but not on all Native American tribal lands', 'question': 'Which areas of the United States were still able to deny sames sex marriages after the case in which Edith \"Edie\" Windsor was the main plaintiff?', 'supporting_facts': [['Edith Windsor', 1], ['Same-sex marriage in the United States', 0]], 'context': [['Same-sex marriage in the United States', ['In the United States, same-sex marriage is legal in all states, Washington, D.C., as well as all U.S. 
territories except American Samoa, but not on all Native American tribal lands, since June 26, 2015, when the United States Supreme Court ruled in \"Obergefell v. Hodges\" that state-level bans on same-sex marriage are unconstitutional.', ' The court ruled that the denial of marriage licenses to same-sex couples and the refusal to recognize those marriages performed in other jurisdictions violates the Due Process and the Equal Protection clauses of the Fourteenth Amendment of the United States Constitution.', ' The ruling overturned a 1972 precedent, \"Baker v. Nelson\".', \" Just prior to the Supreme Court's ruling in 2015, same-sex marriage was legal in many but not all U.S. jurisdictions.\"]], ['Gin Chow', ['Gin Chow (1857 - June 1933) was a Chinese immigrant who gained fame in California as a prophet and fortune teller able to predict the weather and other natural events.', ' Chow is credited with successfully predicting the 1925 Santa Barbara earthquake.', ' Chow was also the main plaintiff in the California Supreme Court case \"Gin Chow v. 
City of Santa Barbara\" which still ranks as one of the most important water rights cases in the state.']], ['Grant Commercial Historic District (Grant, Iowa)', ['The Grant Commercial Historic District is a nationally recognized historic district located in Grant, Iowa, United States.', ' It was listed on the National Register of Historic Places in 2002.', ' At the time of its nomination it contained 17 resources, which included 15 contributing buildings, two contributing structures, and one non-contributing building.', \" The historic district covers the town's central business district.\", ' Grant is a small town located in northeast Montgomery County in the southwest quadrant of the state.', ' It was plated in 1858, and it was known as Milford until the early 20th century even though its post office was Grant.', ' While not on a railroad, the town was still able to maintain a viable commercial district.']], ['Human trafficking in Taiwan', ['Taiwan is primarily a destination for men, women, and children trafficked for the purposes of forced labor and sexual exploitation.', ' It is also a source of women trafficked to Japan, Australia, the United Kingdom, and the United States.', ' Women and girls from the People’s Republic of China (P.R.C.) 
and Southeast Asian countries are trafficked to Taiwan through fraudulent marriages, deceptive employment offers, and illegal smuggling for sexual exploitation and forced labor.', ' Many trafficking victims are workers from rural areas of Vietnam, Thailand, Indonesia, and the Philippines, employed through recruitment agencies and brokers to perform low skilled work in Taiwan’s construction, fishing, and manufacturing industries, or to work as domestic servants.', ' Such workers are often charged high job placement and service fees, up to $14,000, resulting in substantial debt that labor brokers or employers use as a tool for involuntary servitude.', ' Many foreign workers remain vulnerable to trafficking because legal protections, oversight by authorities and enforcement efforts are inadequate.', ' Taiwan authorities reported that traffickers continued to use fraudulent marriages to facilitate labor and sex trafficking, despite increased efforts by the authorities to prevent this practice.', ' Some women who are smuggled onto Taiwan to seek illegal work were sometimes sold in auctions to sex traffickers, and subsequently forced to work in the commercial sex industry.', ' NGOs reported a sharp increase during the reporting period in the number of boys rescued from prostitution, mainly discovered during police investigations of online social networking sites suspected of being front operations for prostitution rings.']], ['Sea turtle migration', ['Sea turtle migration refers to the long-distance movements of sea turtles (superfamily Chelonioidea) notably as adults but may also refer to the offshore migration of hatchings.', ' Sea turtle hatchings emerge from underground nests and crawl across the beach towards the sea.', ' They then maintain an offshore heading until they reach the open sea.', ' The feeding and nesting sites of adult sea turtles are often distantly separated meaning some must migrate hundreds or even thousands of kilometres.', ' Several main patterns 
of adult migration have been identified.', ' Some such as the green sea turtle shuttle between nesting sites and coastal foraging areas.', ' The loggerhead sea turtle uses a series of foraging sites.', ' Others such as the leatherback sea turtle and olive ridley sea turtle do not show fidelity to any specific coastal foraging site.', ' Instead, they forage in the open sea in complex movements apparently not towards any goal.', ' Although the foraging movements of leatherbacks seem to be determined to a large part by passive drift with the currents, they are still able to return to specific sites to breed.', ' The ability of adult sea turtles to travel to precise targets has led many to wonder about the navigational mechanisms used.', \" Some have suggested that juvenile and adult turtles might use the Earth's magnetic field to determine their position.\", ' There is evidence for this ability in juvenile green sea turtles.']], ['Market share liability', ['Market share liability is a legal doctrine that allows a plaintiff to establish a prima facie case against a group of product manufacturers for an injury caused by a product, even when the plaintiff does not know from which defendant the product originated.', \" The doctrine is unique to the law of the United States and apportions liability among the manufacturers according to their share of the market for the product giving rise to the plaintiff's injury.\"]], ['Capron v. Van Noorden', ['Capron v. Van Noorden, 6 U.S. 126 (1804) , was a United States Supreme Court case in which the Court allowed a plaintiff to dismiss a case that he had lost at trial because of a lack of diversity jurisdiction, leaving the plaintiff free to bring the case again.']], ['Barnes v. Yahoo!, Inc.', ['Barnes v. Yahoo!, Inc., 570 F. 3d 1096 (D. Or.', ' Nov. 
8, 2005), is a United States Court of Appeals for the Ninth Circuit case in which the Ninth Circuit held that Section 230 of the Communications Decency Act (CDA) rules that Yahoo!, Inc., as an Internet service provider cannot be held responsible for failure to remove objectionable content posted to their website by a third party.', \" Plaintiff Cecilia Barnes made claims arising out of Defendant Yahoo!, Inc.'s alleged failure to honor promises to remove offensive content about the plaintiff posted by a third party.\", ' The content consisted of a personal profile with nude photos of the Plaintiff and her contact information.', \" The United States District Court for the District of Oregon had dismissed Barnes' complaint.\"]], ['Lujan v. G & G Fire Sprinklers, Inc.', ['Lujan v. G & G Fire Sprinklers, Inc., 532 U.S. 189 (2001), was a United States Supreme Court case decided in 2001.', ' The case concerned a provision of the California Labor Code which allowed the state to withhold payment to contractors or subcontracters if found in breach of contract, without a specific hearing on the matter.', ' The Court upheld the provision because the companies were still able to pursue a claim in state court.']], ['Edith Windsor', ['Edith \"Edie\" Windsor (née Schlain; June 20, 1929 – September 12, 2017) was an American LGBT rights activist and a technology manager at IBM.', ' She was the lead plaintiff in the Supreme Court of the United States case \"United States v. Windsor\", which successfully overturned Section 3 of the Defense of Marriage Act and was considered a landmark legal victory for the same-sex marriage movement in the United States.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:41.433\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5addf6135542990dbb2f7f23', 'answer': 'Genderqueer', 'question': 'Which identifier: transwomen, cis women,or genderqueer, is a combination of masculinity and femininity or neither in gender expression? ', 'supporting_facts': [['Discwoman', 0], ['Genderqueer', 1]], 'context': [['Transgender', ['Transgender people are people who have a gender identity or gender expression that differs from their assigned sex.', ' Transgender people are sometimes called \"transsexual\" if they desire medical assistance to transition from one sex to another.', ' \"Transgender\" is also an umbrella term: in addition to including people whose gender identity is the \"opposite\" of their assigned sex (trans men and trans women), it may include people who are not exclusively masculine or feminine (people who are genderqueer/non-binary, e.g. 
bigender, pangender, genderfluid, or agender).', ' Other definitions of \"transgender\" also include people who belong to a third gender, or conceptualize transgender people as a third gender.', ' Infrequently, the term \"transgender\" is defined very broadly to include cross-dressers, regardless of their gender identity.']], ['Genderqueer', ['Genderqueer (GQ), also termed non-binary (NB), is a catch-all category for gender identities that are not exclusively masculine or feminineidentities which are thus outside the gender binary and cisnormativity.', ' Genderqueer people may express a combination of masculinity and femininity, or neither, in their gender expression.']], ['Transitioning (transgender)', [\"Transitioning is the process of changing one's gender presentation and/or sex characteristics to accord with one's internal sense of gender identity – the idea of what it means to be a man or a woman, or genderqueer (in-between).\", ' For transgender and transsexual people, this process commonly involves reassignment therapy (which may include hormone replacement therapy and sex reassignment surgery), with their gender identity being opposite that of their birth-assigned sex and gender.', ' Transitioning might involve medical treatment, but it does not always involve it.', ' For genderqueer people, it is neither solely female nor male.', ' Cross-dressers, drag queens, and drag kings tend not to transition, since their variant gender presentations are (usually) only adopted temporarily.']], ['Soft butch', ['A soft butch, or stem (stud-fem), is a woman who exhibits some stereotypical butch and lesbian traits without fitting the masculine stereotype associated with butch lesbians.', ' Soft butch is on the spectrum of butch, as are stone butch and masculine, whereas on the contrary, ultra fem, high femme, and lipstick lesbian are some labels on the spectrum of lesbians with a more prominent expression of femininity, also known as femmes.', ' Soft butches have gender 
identities of women, but primarily display masculine characteristics; soft butches predominantly express masculinity with a touch of femininity.', ' The \"hardness\", or label depicting one\\'s level of masculine expression as a butch is dependent upon the fluidity of her gender expression.', ' Soft butches might want to express themselves through their clothing and hairstyle in a more masculine way, but their behavior in a more traditionally feminine way.', ' For example, these traits of a soft butch may or may not include short hair, clothing that was designed for men, and masculine mannerisms and behaviors.', ' Soft butches generally appear androgynous, rather than adhering to strictly feminine or masculine norms and gender identities.', ' Soft butches generally physically, sexually, and romantically express themselves in more masculine than feminine ways in the majority of those categories.']], ['Femininity', ['Femininity (also called girlishness, womanliness or womanhood) is a set of attributes, behaviors, and roles generally associated with girls and women.', ' Femininity is socially constructed, but made up of both socially-defined and biologically-created factors.', ' This makes it distinct from the definition of the biological female sex, as both males and females can exhibit feminine traits.', ' People who exhibit a combination of both masculine and feminine characteristics are considered androgynous, and feminist philosophers have argued that gender ambiguity may blur gender classification.', ' Modern conceptualizations of femininity also rely not just upon social constructions, but upon the individualized choices made by women.']], ['Gender identity', [\"Gender identity is one's personal experience of one's own gender.\", ' Gender identity can correlate with assigned sex at birth, or can differ from it completely.', \" All societies have a set of gender categories that can serve as the basis of the formation of a person's social identity in relation to 
other members of society.\", ' In most societies, there is a basic division between gender attributes assigned to males and females, a gender binary to which most people adhere and which includes expectations of masculinity and femininity in all aspects of sex and gender: biological sex, gender identity, and gender expression.', ' In all societies, some individuals do not identify with some (or all) of the aspects of gender that are assigned to their biological sex; some of those individuals are transgender or genderqueer.', ' Some societies have third gender categories.']], ['Gender variance', ['Gender variance, or gender nonconformity, is behavior or gender expression by an individual that does not match masculine and feminine gender norms.', ' People who exhibit gender variance may be called \"gender variant\", \"gender non-conforming\", \"gender diverse,\" \"gender atypical\" or \"genderqueer\", and may be transgender or otherwise variant in their gender identity.', ' In the case of transgender people, they may be perceived, or perceive themselves as, gender nonconforming before transitioning, but might not be perceived as such after transitioning.', ' Some intersex people may also exhibit gender variance.']], ['Gender polarization', ['Gender polarization is a concept in sociology by American psychologist Sandra Bem which states that societies tend to define femininity and masculinity as polar opposite genders, such that male-acceptable behaviors and attitudes are not seen as appropriate for women, and vice versa.', ' The theory is an extension of the sex and gender distinction in sociology in which sex refers to the biological differences between men and women, while gender refers to the cultural differences between them, such that \"gender\" describes the \"socially constructed roles, behaviours, activities, and attributes that a given society considers appropriate for men and women\".', ' According to Bem, gender polarization begins when natural sex 
differences are exaggerated in culture; for example, women have less hair than men, and men have more muscles than women, but these physical differences are exaggerated culturally when women remove hair from their faces and legs and armpits, and when men engage in body building exercises to emphasize their muscle mass.', ' She explained that gender polarization goes further, when cultures construct \"differences from scratch to make the sexes even more different from one another than they would otherwise be\", perhaps by dictating specific hair styles for men and women, which are noticeably distinct, or separate clothing styles for men and women.', ' When genders become polarized, according to the theory, there is no overlap, no shared behaviors or attitudes between men and women; rather, they are distinctly opposite.', ' She argued that these distinctions become so \"all-encompassing\" that they \"pervade virtually every aspect of human existence\", not just hairstyles and clothing but how men and women express emotion and experience sexual desire.', ' She argued that male-female differences are \"superimposed on so many aspects of the social world that a cultural connection is thereby forged between sex and virtually every other aspect of human experience\".']], ['Discwoman', ['Discwoman is a New York based collective, booking agency, and event platform representing and showcasing female-identified (cis women, transwomen, and gender queer) talent in the electronic music community.', ' It was founded in 2014 by Frankie Decaiza Hutchinson who does the outreach for the agency dealing with Public Relations and social media, Emma Burgess-Olson (a.k.a. 
UMFANG) as the resident DJ, and Christine McCharen-Tran who is the event producer and business powerhouse.', \" Discwoman's regular club nights and touring events highlight emerging and established artists from around the world.\", ' Music produced by world-renowned female artists include The Black Madonna, Nicole Moudaber, Star Eyes, Sandunes, Demian Licht, and Nina Sonik whom have contributed to the electronic music culture.', ' The gender imbalance in EDM (electronic dance music) is self-evident showing women making up to ~10.8% of artists in electronic music festivals.', ' In a 2015 report by , it is stated that men comprised 82% of 44 international festivals’ lineups.', ' Discwoman gives feminine-identified talent the platform and more visibility by booking them at bigger venues, streamlining the growth process, and ensuring the artists they are paid what they are worth in a male-dominated dance music industry.']], ['Transmisogyny', ['Transmisogyny (sometimes trans-misogyny) is the intersection of transphobia and misogyny.', ' Transphobia is defined as \"the irrational fear of, aversion to, or discrimination against transgender or transsexual people\".', ' Misogyny is defined as \"a hatred of women\".', ' Therefore, transmisogyny includes negative attitudes, hate, and discrimination of transgender or transsexual individuals who fall on the feminine side of the gender spectrum.', ' The term was coined by Julia Serano in her 2007 book \"Whipping Girl\" and used to describe the unique discrimination faced by trans women because of \"the assumption that femaleness and femininity are inferior to, and exist primarily for the benefit of, maleness and masculinity\", and the way that transphobia intensifies the misogyny faced by trans women (and vice versa).', ' The term discusses how many trans women experience an additional layer of misogyny in the form of fetishization; Serano talks about how society views trans women in certain ways that sexualize them, such as 
them transitioning for sexual reasons, or ways where they’re seen as sexually promiscuous.Transmisogyny is a central concept in transfeminism and is commonly seen in intersectional feminist theory.', \" The suggestion that trans women's femaleness (rather than their femininity) is a source of transmisogyny is rejected by some feminists, who do not regard trans women as female.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:41.656\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 63%|██████▎ | 315/500 [02:31<02:43, 1.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:41.675\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8f55f9554299458435d5bd', 'answer': 'actor', 'question': 'What profession did Willi Forst and Elmer Clifton share?', 'supporting_facts': [['Willi Forst', 0], ['Elmer Clifton', 0]], 'context': [['Kaiserjäger (film)', ['Kaiserjäger is a 1956 Austrian film directed by Willi Forst.']], ['Gently My Songs Entreat', ['Gently My Songs Entreat (German: Leise flehen meine Lieder ) is a 1933 Austrian-German musical film directed by Willi Forst and starring Marta Eggerth, Luise Ullrich and Hans Jaray.', ' Art direction was by Julius von Borsody.', ' The film is a biopic of the composer Franz Schubert (1797–1828).', \" It was Forst's directorial debut.\", ' A British version was made called \"Unfinished Symphony\".', ' The German title refers to the first line of the Lied \"Ständchen\" (Serenade) from Schubert\\'s collection \"Schwanengesang\", \"the most famous serenade in the world\", which Eggerth performs in the film.']], ['Operetta (film)', ['Operetta (German: Operette) is a 1940 musical film directed by Willi Forst and starring Forst, Maria Holst and Dora Komar.', ' The film was made by Wien-Film, a Vienna-based company set up after Austria had been incorporated into Greater Germany following the 1938 Anschluss.', ' It is the first film in director Willi Forst\\'s \"Viennese Trilogy\" followed by \"Vienna Blood\" (1942) and \"Viennese Girls\" (1945).', 
' The film portrays the life of Franz Jauner (1832–1900), a leading musical figure in the city.', ' It is both an operetta film and a Wiener Film.']], ['Elmer Clifton', ['Elmer Clifton (March 14, 1890 – October 15, 1949) was an American writer, director and actor from the early silent days.', ' A collaborator of D.W. Griffith, he appeared in \"The Birth of a Nation\" (1915) and \"Intolerance\" (1916) before giving up acting in 1917 to concentrate on work behind the camera, with Griffith and Joseph Henabery as his mentors.', ' His first feature-length solo effort as a director was \"The Flame of Youth\" with Jack Mulhall.']], ['Miracles Still Happen (1951 film)', ['Miracles Still Happen (German: Es geschehen noch Wunder) is a 1951 West German romantic comedy film directed by Willi Forst and starring Forst, Hildegard Knef and Marianne Wischmann.', ' It was intended by Forst as a more harmless follow-up to his controversial \"Die Sünderin\" which had also starred Knef.']], ['The Prince of Arcadia', ['The Prince of Arcadia (German: Der Prinz von Arkadien) is a 1932 Austrian-German romance film directed by Karl Hartl and starring Willi Forst, Liane Haid and Hedwig Bleibtreu.', ' It premiered on 18 May 1932.']], ['Burgtheater (film)', ['Burgtheater is a 1936 Austrian drama film directed by Willi Forst.', ' Most of the film was shot in the Burgtheater in Vienna.']], ['Viennese Girls', ['Viennese Girls (German:Wiener Mädeln) is a 1945 historical musical film directed by Willi Forst and starring Forst, Anton Edthofer and Judith Holzmeister.', ' The film was made by Wien-Film, a Vienna-based company set up after Austria had been incorporated into Greater Germany following the 1938 Anschluss.', ' It was the third film in Forst\\'s \"Viennese Trilogy\" which also included \"Operetta\" (1940) and \"Vienna Blood\" (1942).', ' The film was finished in 1945, during the closing days of the Second World War.', ' This led to severe delays in its release, which eventually took place 
in 1949 in two separate versions.', ' One was released by the Soviet-backed Sovexport in the Eastern Bloc and the other by Forst.']], [\"A Student's Song of Heidelberg\", [\"A Student's Song of Heidelberg (German:Ein Burschenlied aus Heidelberg) is a 1930 German musical film directed by Karl Hartl and starring Hans Brausewetter, Betty Bird and Willi Forst.\", \" It marked Hartl's directoral debut.\", ' The film is in the tradition of the nostalgic Old Heidelberg.']], ['Willi Forst', ['Willi Forst, born Wilhelm Anton Frohs (7 April 1903 – 11 August 1980) was an Austrian actor, screenwriter, film director, film producer and singer.', ' As a debonair actor he was a darling of the German-speaking film audiences, as a director, one of the most significant makers of the Viennese period musical melodramas and comedies of the 1930s known as \"Wiener Filme\".', ' From the mid-1930s he also recorded many records, largely of sentimental Viennese songs, for the Odeon Records label owned by Carl Lindström AG.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:52:41.788\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5adf3f1d5542992d7e9f9310', 'answer': 'Italian composer', 'question': 'What profeesion do Giacomo Benvenuti and Claudio Monteverdi share?', 'supporting_facts': [['Giacomo Benvenuti', 0], ['Claudio Monteverdi', 0]], 'context': [['Giacomo Badoaro', ['Giacomo Badoaro (1602–1654) was a Venetian nobleman and amateur poet.', ' He is most famous for writing the libretto for Claudio Monteverdi\\'s opera \"Il ritorno d\\'Ulisse in patria\" (1640).', ' He also provided librettos for the operas \"Ulisse errante\" by Francesco Sacrati (1644) and \"Elena rapita da Teseo\" (1653) by Jacopo Melani.', ' He was a member of the Venetian intellectual circle, the Accademia degli Incogniti.']], ['Claudio Monteverdi', ['Claudio Giovanni Antonio Monteverdi (] ; 15 May 1567 (baptized) – 29 November 1643) was an Italian composer, string player and choirmaster.', ' A composer of both secular and sacred music, and a pioneer in the development of opera, he is considered a crucial transitional figure between the Renaissance and the Baroque periods of music history.']], ['Giacomo Benvenuti', ['Giacomo Benvenuti (16 March 1885, Toscolano — 20 January 1943, Barbarano-Salò) was an Italian composer and musicologist.', ' He was the son of organist Cristoforo Benvenuti and studied at the Liceo Musicale in Bologna under Luigi Torchi (musicology) and Marco Enrico Bossi (organ).', ' In 1919 his collection of songs for voice and piano accompaniment, \"Canti a una voce : con accompagnamento di pianoforte\", was published in 
Bologna.', ' In 1922 he published a collection of 17th-century art songs entitled \"35 Arie di vari autori del secolo XVII\".', ' Composer Samuel Barber studied the works of Giulio Caccini, Andrea Falconieri, and other early Italian composers under his tutelage in Milan in 1933-1934.', ' For the Teatro dell\\'Opera di Roma he adapted Claudio Monteverdi\\'s \"L\\'Orfeo\" for a production which premiered on 27 December 1934.', ' The adaptation was later used for the first recording of \"L\\'Orfeo\" in 1939, which included a performance by the orchestra of La Scala Milan under conductor Ferrucio Calusio.']], [\"Il ritorno d'Ulisse in patria\", ['Il ritorno d\\'Ulisse in patria (SV 325, \"The Return of Ulysses to his Homeland\") is an opera consisting of a prologue and five acts (later revised to three), set by Claudio Monteverdi to a libretto by Giacomo Badoaro.', ' The opera was first performed at the Teatro Santi Giovanni e Paolo in Venice during the 1639–1640 carnival season.', ' The story, taken from the second half of Homer\\'s \"Odyssey\", tells how constancy and virtue are ultimately rewarded, treachery and deception overcome.', ' After his long journey home from the Trojan Wars Ulisse, king of Ithaca, finally returns to his kingdom where he finds that a trio of villainous suitors are importuning his faithful queen, Penelope.', ' With the assistance of the gods, his son Telemaco and a staunch friend Eumete, Ulisse vanquishes the suitors and recovers his kingdom.']], [\"L'Orfeo discography\", ['These lists show the audio and visual recordings of the opera \"L\\'Orfeo\" by Claudio Monteverdi.', ' The opera was first performed in Mantua in 1607, at the court of Duke Vincenzo Gonzaga, and is one of the earliest of all operas.', ' The first recording of \"L\\'Orfeo\" was issued in 1939, a freely adapted version of Monteverdi\\'s music edited by Giacomo Benvenuti, given by the orchestra of La Scala Milan conducted by Ferrucio Calusio.', ' In 1949 the Berlin Radio 
Orchestra under Helmut Koch recorded the complete opera, on long-playing records (LPs).', ' The advent of LP recordings was, as Harold Schonberg later wrote, an important factor in the postwar revival of interest in Renaissance and Baroque music, and from the mid-1950s recordings of \"L\\'Orfeo\" have been issued on many labels.', \" Koch's landmark version was reissued in 1962, when it was compared unfavourably with others that had by then been issued.\", ' The 1969 recording by Nicholas Harnoncourt and the Vienna Concentus Musicus, using Harnoncourt\\'s edition based on period instruments, was praised for \"making Monteverdi\\'s music sound something like the way he imagined\".', ' In 1981 Siegfried Heinrich, with the Early Music Studio of the Hesse Chamber Orchestra, recorded a version which re-created the original Striggio libretto ending, adding music from Monteverdi\\'s 1616 ballet \"Tirsi e Clori\" for the Bacchante scenes.', ' Among more recent recordings, that of Emmanuelle Haïm has been praised for its dramatic effect.', ' The 21st century has seen the issue of an increasing number of recordings on DVD.']], ['John Whenham', ['John Whenham is an English musicologist and academic who specializes in early Italian baroque music.', ' He earned both a Bachelor of Music and a Master of Music from the University of Nottingham, and a Doctor of Philosophy from the University of Oxford.', ' He is a leading expert on the life and works of Claudio Monteverdi, and is the author of the books \"Duet and Dialogue in the Age of Monteverdi\" (Ann Arbor, Michigan: University Microfilms International, 1982) \"Monteverdi, \\'Orfeo\\' \" (London: Cambridge University Press, 1986), \"Monteverdi, Vespers (1610)\" (Cambridge University Press, 1997), and \"The Cambridge Companion to Monteverdi\" (with Richard Wistreich, Cambridge University Press, 2007).', ' For five years he was co-editor of the journal \"Music & Letters\".', ' He currently serves on the board of the Birmingham 
Early Music Festival and is head of the music history department at the University of Birmingham.']], ['Ricciardo Amadino', ['Ricciardo Amadino (\"fl.\"', ' 1572–1621) was a Venetian printer.', ' He briefly attempted to publish music on his own in 1579, but was unsuccessful.', ' He joined with Giacomo Vincenti, with whom he published over 80 books between 1583 and 1586.', ' Many of these were reprints of popular madrigal books, but some were first printings.', ' Their partnership ended around 1586, but they continued to work together occasionally.', ' After 1586, Amadino\\'s mark was a woodcut of an organ, and he printed primarily music, with a few theoretical treatises, including the first edition of Ercole Bottrigari\\'s \"Il desiderio\".', ' He printed editions of such important composers as Luca Marenzio and Claudio Monteverdi, including the celebrated 1609 edition of \"L\\'Orfeo\", and in terms of sheer output was one of the foremost Italian music printers.']], ['Sergio Vartolo', ['Sergio Vartolo (Bologna, 1944) is an Italian harpsichordist, organist, musicologist and conductor; in past also active as countertenor.', ' In 1996 he was appointed maestro de capella of the Cappella Musicale di San Petronio di Bologna founded in 1436.', ' He has an extensive discography, both as a harpsichordist - the complete works of Girolamo Frescobaldi, and as a conductor - particularly works by Giovanni Paolo Colonna and Giacomo Antonio Perti associated with San Petronio, but also operas by Claudio Monteverdi and others.']], ['Stattkus-Verzeichnis', ['The Stattkus-Verzeichnis (SV) is a catalogue of the musical compositions of the Italian composer Claudio Monteverdi.', ' The catalogue was published in 1985 by Manfred H. 
Stattkus (\"Claudio Monteverdi: Verzeichnis der erhaltenen Werke\").', ' A free, basic second edition of the catalogue is available online.']], ['Monteverdi (crater)', ['Monteverdi is a crater on Mercury.', ' It has a diameter of 138 kilometers.', ' Its name was adopted by the International Astronomical Union in 1979.', ' Monteverdi is named for the Italian composer Claudio Monteverdi, who lived from 1567 to 1643.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 319/500 [02:31<01:31, 1.97it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:41.788\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ac245015542992f1f2b3829', 'answer': 'Louis \"Louie\" Zamperini', 'question': 'Who was a Christian Evangelist and US prisoner of war survivor that was the basis for a film directed by Angelina Jolie?', 'supporting_facts': [['Unbroken (film)', 1], ['Louis Zamperini', 0]], 'context': [['Cyborg 2', ['Cyborg 2, released in some countries as Glass Shadow, is a 1993 American science fiction action film directed by Michael Schroeder and starring Elias Koteas, Angelina Jolie, Billy Drago, Karen Sheperd and Jack Palance.', ' It is an unrelated sequel to the 1989 film \"Cyborg\", although 
footage from the original is used in a dream sequence.', ' It was also Jolie\\'s film debut in a starring role (she had previously made an earlier film, \"Lookin\\' to Get Out\", as a child actress).', ' It was followed by the 1995 direct-to-video release \"\".']], ['Unbroken (film)', ['Unbroken is a 2014 American war film produced and directed by Angelina Jolie, written by the Coen brothers, Richard LaGravenese, and William Nicholson, based on the 2010 non-fiction book by Laura Hillenbrand, \"\".', ' The film revolves around the life of USA Olympian and army officer Louis \"Louie\" Zamperini.', ' Zamperini survived in a raft for 47 days after his bomber crash landed in the ocean during World War II, then was sent to a series of prisoner of war camps.']], ['Salt (2010 film)', ['Salt is a 2010 American action thriller film directed by Phillip Noyce, written by Kurt Wimmer, and starring Angelina Jolie, Liev Schreiber, Daniel Olbrychski, August Diehl, and Chiwetel Ejiofor.', ' Jolie plays Evelyn Salt, who is accused of being a Russian sleeper agent and goes on the run to try to clear her name.']], ['Angelina Jolie filmography', ['Angelina Jolie is an American actress and filmmaker.', ' As a child, she made her screen debut in the 1982 comedy film \"Lookin\\' to Get Out\", acting alongside her father Jon Voight.', ' Eleven years later she appeared in her next feature, the low-budget film \"Cyborg 2\", a commercial failure.', ' She then starred as a teenage hacker in the 1995 science fiction thriller \"Hackers\", which went on to be a cult film despite performing poorly at the box-office.', ' Jolie\\'s career prospects improved with a supporting role in the made-for-television film \"George Wallace\" (1997), for which she received the Golden Globe Award for Best Supporting Actress – Television Film.', ' She made her breakthrough the following year in HBO\\'s television film \"Gia\" (1998).', ' For her performance in the title role of fashion model Gia Carangi, she won 
the Golden Globe Award for Best Actress – Television Film.']], ['Gone in 60 Seconds (1974 film)', ['Gone in 60 Seconds is a 1974 American action film written, directed, produced by, and starring H.B. \"Toby\" Halicki.', ' It centers on a group of car thieves and the 48 cars they must steal in a matter of days.', ' The film is known for having wrecked and destroyed 93 cars in a 40-minute car chase scene.', ' This film is the basis for the 2000 remake starring Nicolas Cage and Angelina Jolie.']], ['In the Land of Blood and Honey', ['In the Land of Blood and Honey is a 2011 American war film written, produced, and directed by Angelina Jolie and starring Zana Marjanović, Goran Kostić, and Rade Šerbedžija.', \" The film, Jolie's first commercial release as a director, depicts a love story set against the background of the Bosnian War.\", ' It opened in the United States on December 23, 2011, in a limited theatrical release.']], ['By the Sea (2015 film)', ['By the Sea is a 2015 American romantic drama film written and directed by Angelina Jolie, and produced by and starring Jolie and Brad Pitt.', ' The film was released on November 13, 2015, by Universal Pictures.']], ['Angelina Jolie', ['Angelina Jolie Pitt ( ; née Voight; born June 4, 1975) is an American actress, filmmaker, and humanitarian.', \" She has received an Academy Award, two Screen Actors Guild Awards, and three Golden Globe Awards, and has been cited as Hollywood's highest-paid actress.\", ' Jolie made her screen debut as a child alongside her father, Jon Voight, in \"Lookin\\' to Get Out\" (1982).', ' Her film career began in earnest a decade later with the low-budget production \"Cyborg 2\" (1993), followed by her first leading role in a major film, \"Hackers\" (1995).', ' She starred in the critically acclaimed biographical cable films \"George Wallace\" (1997) and \"Gia\" (1998), and won an Academy Award for Best Supporting Actress for her performance in the drama \"Girl, Interrupted\" (1999).']], 
['First They Killed My Father (film)', ['First They Killed My Father (Khmer: មុន\\u200bដំបូង\\u200bខ្មែរ\\u200bក្រហម\\u200bសម្លាប់\\u200bប៉ា\\u200bរបស់\\u200bខ្ញុំ \"Moun\\u200b dambaung\\u200b Khmer\\u200b Krahm\\u200b samleab\\u200b ba\\u200b robsa\\u200b khnhom\") is a 2017 biographical historical thriller film directed by Angelina Jolie and written by Jolie and Loung Ung, based on Ung\\'s memoir of the same name.', ' Set in 1975, the film depicts 5-year-old Ung who is forced to be trained as a child soldier while her siblings are sent to labor camps during the Khmer Rouge regime.']], ['Louis Zamperini', ['Louis Silvie \"Louie\" Zamperini (January 26, 1917 – July 2, 2014) was a US prisoner of war survivor in World War II, a Christian evangelist and an Olympic distance runner.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:52:42.142\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a74547755429979e2882900', 'answer': 'Texas A&M Aggies football', 'question': 'the head football coach at the University of Houston from 2007 to 2011, is the current team coach of which football team ?', 'supporting_facts': [['Texas A&M Aggies football', 0], ['Texas A&M Aggies football', 4], ['Kevin Sumlin', 1]], 'context': [['Willie Fritz', ['Willie Fritz (born April 2, 1960) is an American football coach and former player.', ' He is the current head coach at Tulane University.', ' From 2014 to 2015, he was head coach at Georgia Southern University.', ' From 2010 to 2013, he was the head football coach at Sam Houston State University.', ' From 1997 to 2009, Fritz served as the head football coach at the University of Central Missouri.', ' From 1993 to 1996, he was the head football coach at Blinn College, a junior college in Brenham, Texas.']], ['Carl Anderson (American football)', ['Carl Rudolph Frederick \"Swede\" Anderson IV (September 9, 1898 – April 30, 1978) was an American college football coach at Western Kentucky University and Howard Payne University.', ' Anderson graduated from Centre College in Danville, Kentucky in 1924, where he played in the backfield with legendary alumnus Bo McMillin.', ' Anderson then followed McMillin to Centenary College of Louisiana and Geneva College.', ' Anderson then served one year as the head football coach at Western Kentucky, before moving to Kansas State as its freshman team coach in 1930.', ' Anderson returned to Western Kentucky as its head coach from 1934 to 1937.', ' He 
was the backfield coach under McMillin at Indiana from 1938 to 1945.', ' He then returned to his alma mater, Centre College, where he coached the Praying Colonels until 1950.', ' The following season, Anderson became the seventh head football coach at the Howard Payne University in Brownwood, Texas and held that position from 1951 to 1952.', ' His coaching record at Howard Payne was 7–10.']], ['Tom Keele', ['Tom Keele (born c. 1933) is a former American football coach.', ' He served as the head football coach at California State University, Northridge from 1979 to 1985, compiling a record of 31–42–1.', ' Keele graduated from Jefferson High School in Portland Oregon in 1951.', ' He attended the University of Oregon, where he played football for the Oregon Webfoots as a tackle from 1957 to 1959.', ' Keele began his coaching career in 1960 at North Eugene High School in Eugene, Oregon, working two years as an assistant football coach and sophomore basketball coach.', ' He moved to Oregon City High School in Oregon City, Oregon in 1962, serving as head football coach and leading his team to a 9–1–1 record.', ' The following year, he was hired as head football coach at the newly-formed Sheldon High School in Eugene.']], ['Tim Landis', ['Timothy Joseph \"Tim\" Landis (born July 13, 1964) is an American football coach who is currently quarterbacks coach and special teams coordinator at Lycoming College.', ' Previously, Landis was the head coach for the Rensselaer Polytechnic Institute football team.', ' He was also formerly the offensive coordinator for the San Jose State Spartans football team and the head football coach for Bucknell University.', ' He compiled a 23–33 record at Bucknell since 2003 and a 76–85–1 record overall.', \" Prior to arriving at Bucknell, Landis served as head football coach at Davidson and St. 
Mary's.\"]], ['Kevin Sumlin', ['Kevin Warren Sumlin (born August 3, 1964) is an American football coach and former player who is the head coach at Texas A&M University.', ' Previously, Sumlin was the head football coach at the University of Houston from 2007 to 2011.']], ['Robert P. Wilson', ['Robert P. \"Bert\" Wilson was an American football player and coach.', \" He played football for Wesleyan University and was captain of the school's football team in 1896.\", \" After graduating, he served as Wesleyan's first head football coach from 1898 to 1902.\", \" In five years as Wesleyan's coach, Wilson compiled a record of 25–21–2.\", ' In his first two years as the coach, Wesleyan compiled records of 7–3 and 7–2.', \" In the 17 years before Wilson took over as the coach, Wesleyan's football team had never won seven games in a single season.\", ' In 1903, Wilson became the head football coach at New York University (NYU).', ' He served the sixth head football coach at NYU and held that position for one season, in 1903, leading the NYU Violets to a record of 2–5.']], ['Ernest T. Jones', ['Ernest T. Jones (born January 18, 1970) is the current head coach at ASA Miami, a two-year college starting its first football season in 2015.', ' He was briefly running backs coach for the University of Connecticut Huskies football team.', ' He was head football coach at Alcorn State University.', ' He was named the head football coach after the 2007 season and served as head coach in 2008.', ' He was controversially fired from this position in December 2008.', ' He returned to the University of Cincinnati as the Director of Player Services in 2009.', ' For the 2010 he will be an assistant coach at the University at Buffalo under former University of Cincinnati assistant coach and now UB head football Coach Jeff Quinn.']], ['K. C. Keeler', ['Kurt Charles \"K. 
C.\" Keeler (born July 26, 1959) is an American football coach and former player.', ' He is currently the head football coach at Sam Houston State University.', ' He was the head football coach at the University of Delaware from 2002 to 2012.', ' Keeler served as the head football coach at Rowan University from 1993 to 2001.', \" His 2003 Delaware Fightin' Blue Hens squad won the NCAA Division I-AA Football Championship, and returned to the Division I Championship game in 2007 and 2010.\"]], ['Butch Davis', ['Paul Hilton \"Butch\" Davis, Jr. (born November 17, 1951) is an American football coach.', ' He is the head football coach at Florida International University.', ' After graduating from the University of Arkansas, he became an assistant college football coach at Oklahoma State University and the University of Miami before becoming the defensive coordinator for the Dallas Cowboys of the National Football League (NFL).', \" He was head coach of the University of Miami's Hurricanes football team from 1995 to 2000 and the NFL's Cleveland Browns from 2001 to 2004.\", ' Davis served as the head coach of the University of North Carolina at Chapel Hill (UNC) Tar Heels football team from 2007 until the summer of 2011, when a series of National Collegiate Athletic Association (NCAA) investigations resulted in his dismissal.', \" He was hired by the NFL's Tampa Bay Buccaneers as an advisor in February 2012.\"]], ['Texas A&M Aggies football', ['The Texas A&M Aggies football program represents Texas A&M University in the sport of American football.', ' The Aggies compete in the Football Bowl Subdivision (FBS) of the National Collegiate Athletic Association (NCAA) and the Western Division of the Southeastern Conference (SEC).', ' Texas A&M football claims three national titles and eighteen conference titles.', ' The team plays all home games at the newly redeveloped Kyle Field, a 102,733-person capacity outdoor stadium on the university campus.', \" Kevin Sumlin is 
currently the team's head coach.\"]]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 64%|██████▍ | 322/500 [02:31<01:07, 2.62it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:42.171\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a796bfd55429970f5fffeac', 'answer': 'A simple iron boar crest', 'question': 'What adorns the archaeological artefact excavated by Thomas Bateman on 3 May 1848?', 'supporting_facts': [['Pioneer Helmet', 2], ['Benty Grange helmet', 0]], 'context': [['May Assembly', ['May Assembly (Serbian: Мајска скупштина / Majska skupština ) was the national assembly of the Serbs in Austrian Empire, held on 1 and 3 May 1848 in Sremski Karlovci, during which the Serbs proclaimed autonomous Serbian Vojvodina.', ' This action was later recognized by the supreme Austrian authority in Vienna.', ' May Assembly was part of the European Revolutions of 1848.']], ['Artognou stone', ['The Artognou stone, sometimes referred to as the Arthur stone, is an archaeological artefact uncovered in Cornwall in the United Kingdom.', ' It was discovered in 1998 in securely dated sixth-century contexts among the ruins at Tintagel Castle in Cornwall, a secular, high status settlement of sub-Roman 
Britain.', ' It appears to have originally been a practice dedication stone for some building or other public structure, but it was broken in two and re-used as part of a drain when the original structure was destroyed.', ' Upon its discovery the stone achieved some notoriety due to the suggestion that \"Artognou\" was connected to the legendary King Arthur, though scholars such as John Koch have criticized the evidence for this connection.']], ['Slatino furnace model', ['The Slatino furnace model is an ancient ceramic artefact excavated at an archeological site near Slatino in Bulgaria.', ' It was found among the remains of a burned down dwelling dated from the Eneolithic period (ca. 5000 BCE).', \" The description 'furnace model' (and name) has been adopted in the absence of a definite idea about the use and meaning of the object.\", ' On its largest flat side there is a clearly traced rough']], ['Five Wells', ['Five Wells is a chambered tomb and scheduled ancient monument on Taddington Moor in the Peak District.', ' Three stones mark the main chamber, which has been dramatically reduced; a second less well-preserved chamber is to the west.', ' Access can be had on foot via a permitted path from Pillwell Gate to the west.', ' The mound was excavated by Thomas Bateman in 1846.']], ['Pioneer Helmet', ['The Pioneer Helmet (also known as Wollaston Helmet or Northamptonshire Helmet) is a 7th-century Anglo-Saxon boar-crested helm found by archaeologists from Northamptonshire Archaeology at a quarry site operated by Pioneer Aggregates.', ' This helmet is very similar in its basic design to the Coppergate Helmet, although it is much larger, and was likely to have had two cheek plates (of which only one remained) and a nasal (which was bent inwards at the time of deposition to render the piece unwearable).', ' A simple iron boar crest adorns the top of this helmet associating it with the Benty Grange helmet and the Guilden Morden boar from the same period, and 
descriptions in the poem Beowulf.', ' The helmet accompanied the burial of a young male, possibly laid on a bed with a pattern welded sword, small knife, hanging bowl, three iron buckles and a copper alloy clothes hook.']], ['Benty Grange helmet', ['The Benty Grange helmet is an archaeological artefact excavated by Thomas Bateman on 3 May 1848 from an Anglo-Saxon tumulus (or \"barrow\") at the Benty Grange Farm in the civil parish of Monyash in the English county of Derbyshire.']], ['Jewellery', ['Jewellery (British English) or jewelry (American English) consists of small decorative items worn for personal adornment, such as brooches, rings, necklaces, earrings, pendants and bracelets. Jewellery may be attached to the body or the clothes, and the term is restricted to durable ornaments, excluding flowers for example.', ' For many centuries metal, often combined with gemstones, has been the normal material for jewellery, but other materials such as shells and other plant materials may be used.', ' It is one of the oldest type of archaeological artefact – with 100,000-year-old beads made from \"Nassarius\" shells thought to be the oldest known jewellery.', ' The basic forms of jewellery vary between cultures but are often extremely long-lived; in European cultures the most common forms of jewellery listed above have persisted since ancient times, while other forms such as adornments for the nose or ankle, important in other cultures, are much less common.']], ['Morphology (archaeology)', ['Morphology in archaeology, the study of shapes and forms, and their grouping into period styles remains a crucial tool, despite modern techniques like radiocarbon dating, in the identification and dating not only of works of art but all classes of archaeological artefact, including purely functional ones (ignoring the question of whether purely functional artefacts exist).', ' The term morphology (\"study of shapes\", from the Greek) is more often used for this.', ' Morphological 
analyses of many individual artefacts are used to construct typologies for different types of artefact, and by the technique of seriation a relative dating based on shape and style for a site or group of sites is achieved where scientific absolute dating techniques cannot be used, in particular where only stone, ceramic or metal artefacts or remains are available, which is often the case.', ' That artefacts such as pottery very often survive only in fragments makes precise knowledge of morphology even more necessary, as it is often necessary to identify and date a piece of pottery from only a few sherds.']], ['Ngườm', ['Ngườm is an archaeological site in Thái Nguyên Province, northern Vietnam.', ' It is a rock shelter in a limestone cliff near the Thần Sa River that was excavated in 1981 by archaeologists from the Vietnam Institute of Archaeology.', ' Flaked stone artefacts have been found in deposits containing shells with radiocarbon ages of 23,000 years ago.', ' The site is important because of its unusually high proportion of retouched flakes in the stone artefact assemblage, relative to other sites in Southeast Asia.']], ['Joe Bell Site', ['The Joe Bell Site (9MG28) is an archaeological site located in Morgan County, Georgia underneath Lake Oconee, but prior to the 1970s, it was located south of the mouth of the Apalachee River on the western bank of the Oconee River.', ' The junction of these two rivers could be seen from the site.', ' This site was first visited by Marshall Williams in 1968 at the suggestion of the site’s landowner, Joe Bell, who had discovered various artifacts while the site was being regularly plowed.', ' Because of Interstate construction, Marshall Williams and Mark Williams discovered this site during surface surveys and excavations of the plowed areas.', ' The site was excavated and analyzed by Mark Williams as part of his PhD dissertation.', ' During the 1969 excavations, four areas within the site were designated for excavation.', ' 
In Areas 1-3 various five foot square units were excavated.', ' No excavations were done in Area 4 in 1969.', ' Large quantities of small potsherds were discovered during these excavations, and they ranged from the Duvall Phase in Area 1 to Bell Phase in Areas 2-4.', ' As part of the 1969 excavations, a road grading machine took off the topsoil of twelve strips on the site.', ' This uncovered Features 1 and 2, and they were completely excavated.', ' In 1977, the site was revisited by Marshall Williams and Mark Williams.', ' Since various plans threatened this site, major excavations took place from June 15, 1977 until September 16, 1977 by Mark Williams.', ' Most of the work centered on Area 2 or the Bell Phase portion of the site.', ' The Bell Phase portion of this site was probably no more the 1.5 acres .', ' Because of time constraints, only 17 of 55 features were excavated, and no more than a handful of the 1100 posts were excavated.', ' A few trips were made back to the site the following year with the help of volunteers, and approximately 80% of the area stripped by heavy machinery was mapped.', ' Some of the features were trash features that consisted of a circular pit filled with food residues and pottery sherds.', ' Evidence of a large circular structure or rotunda was found at the site.', ' It was the social, political, and religious center for the inhabitants.', ' A large quantity of the features was small, circular, semi-subterranean structures that were probably used as sleeping quarters on cold nights.', ' Another structure found was warm weather structures.', ' One major trash feature was found that had been deposited in a single episode and was burned during or after deposition.', ' Numerous sherds were found in this pit, and a large number of reconstructable vessels were present.', ' Ethnohistoric literature of the Southeast suggests that this feature was formed during a Busk or Green Corn ceremony.', ' The ceremony has been described as the 
physical cleansing of the town.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:42.204\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5adc65e85542996e68525350', 'answer': 'no', 'question': 'Are both Dafeng District and Dazhou located in the same province?', 'supporting_facts': [['Dafeng District', 0], ['Dazhou', 0]], 'context': [['Sichuan–Shanghai gas pipeline', ['Sichuan–Shanghai gas pipeline () is a 1702 km long natural gas pipeline in China.', ' The pipeline runs from Pugang gas field in Dazhou, Sichuan Province, to Qingpu District of Shanghai.', ' An 842 km long branch line connects Yichang in Hubei with Puyang in Henan Province.', ' Two shorter branches are located near the Puguang gas field and one in the east near Shanghai.']], ['Yandu District', ['Yandu District () is one of three districts of Yancheng, Jiangsu province, China.', ' (The other two are Tinghu District and Dafeng District).']], ['Dazhou', ['Dazhou () is a prefecture-level city in the northeast corner of Sichuan province, China, bordering Shaanxi to the north and Chongqing to the east and south.', ' 2002 population was 384,525.']], ['Dafeng District', ['Dafeng () is a coastal district under the administration of Yancheng, Jiangsu province, China.', ' 
Located on the Jiangsu North Plain with a coastline of 112 km , Dafeng was historically one of the largest salt-making areas in China and now is famed for its well preserved eco-system and numerous national conservation parks.', \" The district has the largest national nature reserve for a rare deer species, Père David's Deer or Milu (麋鹿 ) in Chinese.\", ' It borders the prefecture-level city of Taizhou to the southwest.']], ['Tinghu District', ['Tinghu District () is one of three districts of Yancheng, Jiangsu province, China.', ' (The other two are Yandu District and Dafeng District).', ' Prior to 2004, Tinghu District was called the Urban District ()of Yancheng.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:52:42.216\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ae0c9dd5542993d6555ec69', 'answer': 'yes', 'question': 'Are Rob Parissi and Robert Pollard both musicians', 'supporting_facts': [['Rob Parissi', 0], ['Robert Pollard', 0]], 'context': [['Play That Funky Music', ['\"Play That Funky Music\" is a song written by Rob Parissi and recorded by the band Wild Cherry.', ' The single was the first release by the Cleveland-based Sweet City record label in April 1976, and distributed by Epic Records.', \" The performers on the recording included lead singer Parissi, electric guitarist Bryan Bassett, bassist Allen Wentz and drummer Ron Beitle, with session players Chuck Berginc, Jack Brndiar (trumpets), and Joe Eckert and Rick Singer (saxes) on the horn riff that runs throughout the song's verses.\", ' The single hit number one on the \"Billboard\" Hot 100 on September 18, 1976, and was also number one on the Hot Soul Singles chart.', ' The single was certified platinum by the Recording Industry Association of America for shipments of over 2 million records, eventually selling 2.5 million in the United States alone.']], ['Get Down Tonight: The Disco Explosion', ['Get Down Tonight: The Disco Explosion was a 2004 musical documentary special which aired on PBS.', ' The special featured Irene Cara, KC & The Sunshine Band, Yvonne Elliman, The Hues Corporation, Peaches & Herb, Karen Lynn Gorney, A Taste of Honey, Rob Parissi of Wild Cherry, Leo Sayer, Deney Terrio, Frankie Valli, Martha Wash, Barry Williams, Norma Jean Wright and Felton Pilate.', ' It was directed by T.J. Lubinsky, and produced by Jerry Blavat, Henry J. 
DeLuca, Cousin Brucie Morrow and Lubinsky.', ' One of the associate producers was Marty Angelo.']], ['Robert Pollard', ['Robert Ellsworth Pollard Jr. (born October 31, 1957) is an American musician and singer-songwriter who is the leader and creative force behind indie rock group Guided by Voices.', ' In addition to his work with Guided by Voices, he continues to have a prolific solo career with 22 solo albums released so far.']], ['The Crawling Distance', ['The Crawling Distance is 11th studio album released by singer-songwriter Robert Pollard on January 20, 2009.', ' Similar to many of Pollard\\'s releases since \"Fiction Man\" in 2004, all instrumentation on the album was performed by producer Todd Tobias.', ' \"The Crawling Distance\" has a 64/100 score on metacritic and thus was Pollard\\'s lowest rated album on the site, until 2011\\'s \"Space City Kicks\" which has a 62.', ' ']], ['Rob Parissi', ['Robert \"Rob\" Parissi is an American singer, songwriter and guitarist, perhaps best known as frontman for the American funk group Wild Cherry, best known for their 1976 Parissi-penned chart-topper \"Play That Funky Music\".', ' He was born in 1950 and raised in the steel mill town of Mingo Junction, Ohio.', ' He graduated from Mingo High School in 1968.', ' Rob formed the band Wild Cherry in 1970 in Steubenville, Ohio, one mile north of Mingo Junction along the Ohio River.', ' The band played the Ohio Valley region, Wheeling, West Virginia and the rest of the Northern West Virginia panhandle, and Pittsburgh, Pennsylvania.']], ['Choreographed Man of War', ['Choreographed Man of War is an album by Robert Pollard and the Soft Rock Renegades, released in 2001.', ' The album features Robert Pollard (vocals, guitar), Greg Demos (bass), and Jim Macpherson (drums).']], ['Elephant Jokes', [\"Elephant Jokes is the 12th studio album released by singer-songwriter Robert Pollard on August 11, 2009, and the 8th full-length album to be released by Pollard (along with several 
EP's and singles) since the break-up of his band Guided by Voices in 2004.\", ' Unlike recent Pollard albums, Todd Tobias does not play all the instruments on \"Elephant Jokes\", as Pollard plays some guitar on this album.']], ['Weatherman and Skin Goddess', ['Weatherman and Skin Goddess is a limited EP from singer-songwriter Robert Pollard.', \" Only 1,000 CDs and 500 12 inch LPs were put into production and were made available exclusively on Pollard's website.\", \" Released on April 15, this marks the first release from Robert Pollard's record label Guided by Voices Inc.\"]], ['Kid Marine', ['Kid Marine is 3rd album by Robert Pollard, released in 1999.', \" It is the first release of Robert Pollard's Fading Captain Series.\", ' Pollard has stated that the album is about Jeff \"Kid Marine\" Davis, the person pictured on the cover .', ' Robert told Mojo magazine, \"My personal favorite, a weird record, almost a concept album, about the typical Ohio male and what he does - drink, watch television, eat pizza.', \" It got mixed reviews, there are people who hate it and others who think it's our best record and I'm on their side.\", ' I just love the songs.', ' It feels like one piece, like it all fits together.', ' I like the cover and I like the']], ['Robert Pollard Is Off to Business', ['Robert Pollard Is Off to Business is 10th studio album released by singer-songwriter Robert Pollard on June 2, 2008.', ' This is the first LP release from Robert Pollard\\'s new record label \"Guided by Voices Inc\".', ' All instrumentation on the album was performed by producer Todd Tobias.', ' Many of the songs on the album were over three minutes in length, which is unusual for a Pollard release.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:52:42.275\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 65%|██████▌ | 326/500 [02:31<00:43, 4.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:46.362\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:52:52.933\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 10 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 66%|██████▌ | 328/500 [02:42<03:49, 1.33s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 66%|██████▌ | 329/500 [02:43<03:34, 1.26s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:02.789\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:02.800\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 66%|██████▌ | 331/500 [02:52<06:05, 2.16s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:02.890\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:02.916\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:53:02.918\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 67%|██████▋ | 333/500 [02:52<04:22, 1.57s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metricsmetrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", " {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:02.978\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.271\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 67%|██████▋ | 336/500 [02:52<02:47, 1.02s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.292\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.335\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.364\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.402\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 340/500 [02:53<01:37, 1.64it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.413\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.546\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 3 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 68%|██████▊ | 342/500 [02:53<01:16, 2.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:03.556\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:05.834\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 69%|██████▉ | 344/500 [02:55<01:41, 1.54it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.2222222222222222, 'em': 0.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:53:06.399\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 69%|██████▉ | 346/500 [02:56<01:25, 1.81it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:06.498\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 5 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|██████▉ | 348/500 [03:02<03:07, 1.23s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|██████▉ | 349/500 [03:02<02:40, 1.07s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "\u001b[32m2025-12-09 17:53:12.773\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 70%|███████ | 351/500 [03:02<01:53, 1.31it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 71%|███████ | 353/500 [03:02<01:19, 1.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 71%|███████ | 356/500 [03:03<00:50, 2.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 72%|███████▏ | 358/500 [03:03<00:52, 2.69it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 72%|███████▏ | 359/500 [03:04<00:49, 2.87it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 72%|███████▏ | 362/500 [03:04<00:32, 4.19it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ 
"Evaluating workflow: 73%|███████▎ | 366/500 [03:04<00:19, 7.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8571428571428571, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 74%|███████▍ | 371/500 [03:05<00:12, 9.92it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6363636363636364, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▍ | 373/500 [03:05<00:17, 7.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.14285714285714288, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 75%|███████▌ | 375/500 [03:06<00:17, 7.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 379/500 [03:06<00:14, 8.57it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 76%|███████▌ | 381/500 [03:09<00:52, 2.25it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics 
{'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 77%|███████▋ | 384/500 [03:09<00:36, 3.18it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 77%|███████▋ | 387/500 [03:09<00:25, 4.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.75, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 78%|███████▊ | 391/500 [03:10<00:16, 6.76it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 79%|███████▊ | 393/500 [03:11<00:27, 3.86it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 79%|███████▉ | 395/500 [03:11<00:20, 5.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 79%|███████▉ | 397/500 [03:11<00:17, 6.00it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics 
{'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 80%|████████ | 400/500 [03:12<00:19, 5.11it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5454545454545454, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 81%|████████ | 403/500 [03:12<00:17, 5.56it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 81%|████████ | 405/500 [03:12<00:17, 5.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3333333333333333, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 82%|████████▏ | 410/500 [03:13<00:09, 9.75it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.3076923076923077, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 82%|████████▏ | 412/500 [03:13<00:09, 9.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 83%|████████▎ | 414/500 [03:14<00:12, 7.01it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics 
{'f1': 0.13333333333333333, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 83%|████████▎ | 417/500 [03:14<00:12, 6.49it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 84%|████████▍ | 419/500 [03:15<00:15, 5.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.24000000000000002, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 84%|████████▍ | 422/500 [03:15<00:13, 5.66it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.3, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▍ | 423/500 [03:15<00:13, 5.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 85%|████████▍ | 424/500 [03:15<00:15, 5.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 86%|████████▌ | 431/500 [03:16<00:05, 11.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", 
"metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 433/500 [03:27<00:05, 11.58it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:41.567\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8b3d795542997f31a41cc1', 'answer': '!!!', 'question': 'Which band included more previously-known figures when it was formed, !!! or Puddle of Mudd?', 'supporting_facts': [['!!!', 0], ['!!!', 2], ['!!!', 3], ['Puddle of Mudd', 0], ['Puddle of Mudd', 1], ['Puddle of Mudd', 2], ['Puddle of Mudd', 3]], 'context': [['Stuck (EP)', [\"Stuck is Puddle of Mudd's debut EP.\", ' The band had played a local battle of the bands competition and won the grand prize, the chance to record an EP.', ' It was recorded at Red House in Lawrence, KS in 1993, and was released on Mudd Dog/V&R Records in 1994.', ' The MuddDog version is among the rarest collectibles in the history of Puddle of Mudd.', ' \"Stuck\" featured the original version of \"Drift and Die\", which was later included on the band\\'s 2001 \"Come Clean\" album.', ' Puddle of Mudd is currently working on re-releasing the EP.', ' The cover art for \"Stuck\" was designed by a Kansas City based graphic arts studio named \"River City Studio\" owned by Deb Turpin.', ' The invoice for designing the cover art was never paid.']], ['Come Clean (Puddle of Mudd album)', ['Come Clean is the second studio album by the rock band Puddle of Mudd.', \" Released on August 28, 2001, the album's music was responsible for breaking Puddle of Mudd into the mainstream 
music scene.\", ' It features the singles \"Control,\" \"Blurry,\" \"Drift & Die\" and \"She Hates Me\".', ' Various tracks were re-recorded from the band\\'s previous releases, \"Stuck\" and \"Abrasive\".', ' The album reached the Billboard 200 Albums chart peaking at #9.']], ['!!!', ['!!!', ' ( ) is an American dance-punk band that formed in Sacramento, California, United States, in 1996 by lead singer Nic Offer.', ' Members of !!!', ' came from other local bands such as The Yah Mos, Black Liquorice and Popesmashers.', ' They are currently based in New York City.', ' The band\\'s seventh album, \"Shake the Shudder\", was released in May 2017.']], ['Adam Latiff', ['Adam Latiff (born March 24, 1979) is a lead guitarist, rhythm guitarist, songwriter, and vocalist for a number bands, most notable for Puddle of Mudd.', ' He started his career in bands such as Devereux and was a touring guitar player for Eve to Adam until December 2014.', ' Latiff is the lead vocalist and lead guitarist for a national Nirvana tribute band called Heart Shaped Box, and is also the lead vocalist for Vanilla Women, which features former members of Shinedown Cold and Puddle of Mudd.']], ['Adelitas Way', ['Adelitas Way is an American hard rock band formed in Las Vegas, Nevada in 2006.', ' The band\\'s debut single \"Invincible\", broke them into the mainstream scene after the song made numerous television appearances in commercials and live sporting events.', \" As of 2017, the band has toured with notable acts such as Shinedown, Guns N' Roses, Creed, Papa Roach, Godsmack, Theory of a Deadman, Seether, Three Days Grace, Breaking Benjamin, Deftones, Puddle of Mudd, Sick Puppies, Staind, Alter Bridge, Skillet, Halestorm, Thousand Foot Krutch and others.\"]], ['She Hates Me', ['\"She Hates Me\", sometimes colloquially referred to as \"She Fucking Hates Me\", is a song by the band Puddle of Mudd.', ' It was written in 1993 and released in 2002.', ' It continued the group\\'s popularity on the 
\"Billboard\" Hot 100, peaking at number 13, though not as successful as the number 5 hit \"Blurry\".', ' It also topped the \"Billboard\" Hot Mainstream Rock Tracks chart for one week in October.', ' The popularity of \"She Hates Me\" made it become Puddle of Mudd\\'s second single to sell over 500,000 copies in the United States, following \"Blurry\".', \" The song peaked at number 14 in the UK Singles Chart, making it the group's third Top 20 hit and won a 2004 ASCAP Pop Music Award.\"]], ['Stoned (Puddle of Mudd song)', ['\"Stoned\" is the second single off the album \"\" by rock band Puddle of Mudd.', ' The song was available for download on iTunes and online music retail sites on December 8, 2009, and released to radio on March 8, 2010.', ' Stoned was the #1 most added track at Active Rock as soon as it impacted radio, with 60+ new stations coming aboard in a week.', ' The song was written by Puddle of Mudd front-man Wes Scantlin.']], ['Soulidium', ['Soulidium was an American hard rock band formed in Tampa, Florida, United States, in 2006, currently consisting of frontman Michael McKnight, guitarist Braeden Lane, bassist Bobby \"Fuzzy\" Farrell, and drummer Eric Dietz.', ' Under their original line-up, the band released their debut album, \"Children of Chaos\" in mid-2007.', ' The band has toured many well-known bands, including Sevendust, Alice in Chains, Limp Bizkit, Alter Bridge, Puddle of Mudd, Hellyeah, Black Light Burns and Nonpoint.', ' Numerous years after entering into a period of inactivity while attempting to release their sophomore album, initially titled \"Fly 2 the Sun, around mid-2011, it was finally released, now re-titled \"Awaken\" in late 2015.', ' As of 2017, the band is disbanded.']], ['Best of Puddle of Mudd', ['Best of Puddle of Mudd is the first \"best of\" collection from the band Puddle of Mudd.', ' It was released on November 2, 2010 as part of Universal Music Enterprises\\'s \"Icon\" Series of Compilation Albums.', ' It contains 
tracks from their first four major label albums.']], ['Puddle of Mudd', ['Puddle of Mudd is an American rock band formed in 1991.', ' To date, the band has sold over seven million albums and has had a string of No. 1 mainstream rock singles in the United States.', ' Their major-label debut \"Come Clean\" has sold over five million copies.', ' They have released two independent and four major albums, with their latest being \"\" in December 2009, and their most recent compilation album being \"\", released in August 2011.']]], 'type': 'comparison', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 434/500 [03:31<01:49, 1.67s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:53:41.595\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ab28e2a5542993be8fa9947', 'answer': 'Eddie \"The Eagle\" Edwards', 'question': 'Who holds the world record for jumping over 6 buses and appeared on the British television series \"The Jump\"?', 'supporting_facts': [['The Jump', 4], ['Eddie "The Eagle" Edwards', 1]], 'context': [['Åsarna IK', ['Åsarna IK, founded in 1924, is a Swedish sports club in Åsarna.', ' The club has had many prominent competitors in cross country skiing, which is evident in the nickname of the 
village Åsarna, \"Guldbyn\" (golden village), which was coined after the 1988 Winter Olympics when three out of the four cmpetitors in the men\\'s relay competition came from this place.', ' Åsarna IK has also spawned prominent track and field athletes.', ' Anton Bolinder (b. 1915), who started jumping in a gravel pit in Åsarna, became the European champion in high jump in 1946 (jumping 1,99 m), and runner John Isberg broke the junior world record for 1500 m five times in the 1940s.', ' By the time of their international breakthroughs, both Bolinder and Isberg hade changed clubs to IFK Östersund.', ' Bolinder became Swedish champion twice in high jump.', ' In 2015 a book about Åsarna IK will appear.']], ['Vera Olenchenko', ['Vera Olenchenko (born March 21, 1959) is a Soviet born athlete.', ' While she was one of the best long jumpers in the world, she did not make it beyond domestic competition in the prime of her career dominated by a crop of top long jumpers including Olympic champion Tatyana Kolpakova, world record holder Galina Chistyakova, Tatyana Skachko, Yelena Belevskaya, Tatyana Rodionova, Nijolė Medvedeva, Irina Valyukevich and Larysa Berezhna.', ' Following the breakup of the Soviet Union, and the following creation of new republics, Vera was credited with the indoor long jump record for Uzbekistan, which she still holds at 6.82m.', ' While most of her contemporaries disappeared from the scene, Olenchenko continued jumping and made it to an international championship, not representing Uzbekistan but Russia at the 1997 world indoor championships.', ' Her lifetime best was 6.92 from 1985, which ranks tied for the 96th best of all time.', ' But she nearly duplicated that with a 6.90m on June 14, 1996.', ' At the time she was 37 years old and it became the new masters W35 world record.', ' While her record would last for four years before it was surpassed by Heike Drechsler, it remains the exact age 37 world record.', ' It is the only exact age record 
between 17 and 38 not held by the big three women of long jumping; Drechsler, Chistyakova and Jackie Joyner Kersee Four years later, Olenchenko added the W40 record.']], ['Galina Chistyakova', ['Galina Valentinovna Chistyakova (Russian: Галина Валентиновна Чистякова , Slovak: \"Galina Čisťaková\" ; born 26 July 1962) is a retired athlete who represented the Soviet Union and later Slovakia.', ' She is the current world record holder in the long jump, jumping 7.52 metres on 11 June 1988.', ' She is the 1988 Olympic bronze medallist and the 1989 World Indoor champion.', ' She is also a former world record holder (pre IAAF) in the triple jump with 14.52 metres in 1989.']], ['Kathy Bergen', ['Kathy Bergen (born December 24, 1939) is an American Masters athletics track and field athlete.', ' She is the current world record holder in the W70 100 meters and the high jump.', ' She also holds the Indoor World records for the W65 high jump, the W70 60 meters, 200 meters and high jump.', ' And she holds the American record for the W70 200 meters and the W65 high jump.', ' She is the oldest woman to break the 15 second barrier in the 100 meters and to break 32 seconds in the 200 meters.']], ['George Horine', ['George Leslie Horine (February 3, 1890 – November 28, 1948) was an American athlete who mainly competed in the high jump.', ' He is credited with developing a technique called a forerunner to the western roll, a technique he developed due to the layout of his backyard where he practiced which was considered \"backward\" at the time.', ' While on the track team at Stanford University, his technique was corrected to the more conventional jumping style of the time.', ' He equalled the NCAA record in the event at 6\\' 4\" as a sophomore.', ' His junior year, 1912, he reverted to his old style, improving to 6\\' 4\\xa03/4\" and then a world record 6\\' 6\\xa01/8\".', ' A few weeks later at the Olympic Trials, he improved again to jump 6\\' 7\" making him the first man to break 
the 2 m barrier.', ' It was the first high jump world record ratified by the IAAF.', ' He never improved upon his record, which stood for two years.']], ['Eddie "The Eagle" Edwards', ['Michael Edwards (born 5 December 1963), best known as \"Eddie the Eagle\", is a British skier who in 1988 became the first competitor since 1929 to represent Great Britain in Olympic ski jumping, finishing last in the 70 m and 90 m events.', ' He became the British ski jumping record holder, ninth in amateur speed skiing (106.8 mph ), and a stunt jumping world record holder for jumping over 6 buses.']], ['Pedro Pérez', ['Pedro Damián Pérez Dueñas (] ; born February 23, 1952 in Pinar del Río, Cuba) is a retired triple jumper from Cuba.', \" He set the world record in the men's triple jump event on August 5, 1971, jumping 17.40 metres, while still a 19-year-old Junior athlete, in the final of the Pan American Games.\", ' His mark was a centimeter improvement over the three-year-old record of Viktor Sanyeyev set as the last of 5 world record improvements during the 1968 Olympics emphasizing the advantage of jumping at altitude.', ' Cali, Colombia is also considered at altitude.', ' While Sanyeyev reclaimed the record at sea level in Sukhumi, the next record in succession by João Carlos de Oliveira was also set at altitude in Mexico City and lasted ten more years.']], ['Pine Mountain Jump', ['The Pine Mountain Ski Jump is one of the highest artificially created ski jumps in the world, located in Iron Mountain, Michigan, Dickinson County.', ' It is part of the Kiwanis Ski Club and hosts annual FIS Ski Jumping Continental Cup competitions.', ' \"Pine Mountain Slide is known throughout the world as one of the better jumping hills.\"', ' Annually in February, it \"hosts jumpers from around the world at the best tournament in the United States.\"', ' Top-rated foreign jumpers compete.', ' Currently (excluding ski flying hills) Pine Mountain holds the U.S. 
records for the longest jump in World Cup competition at 140m (459 feet), as well as the overall distance record at 143.5m (471 feet).', ' The facility also includes two smaller ski jumping hills that are built into the hill northwest of the large hill.', ' Attendance is about 20,000.']], ['The Jump', ['The Jump is a British television series that follows celebrities as they try to master various winter sports including skeleton, bobsleigh, snowskates, ski cross, and giant slalom.', ' Davina McCall and Alex Brooker presented the first series, with McCall returning for future series.', ' Brooker did not return for future series however.', ' Winter Olympic skier Graham Bell and skeleton gold medallist Amy Williams put the celebrities through training in the UK and Austria.', ' Britain\\'s first Olympic ski jumper, Eddie \"The Eagle\" Edwards, appears live on the show to demonstrate the ski jump.']], ['Cliff jumping', ['Cliff jumping is jumping off a cliff as a form of sport.', ' When done without equipment, it may be also known as tombstoning.', ' It forms part of the sport of coastal exploration or \"coasteering\".', ' When performed with a parachute, it is known as BASE jumping.', ' The world record for cliff jumping is currently held by Laso Schaller, with a jump of 58.8\\xa0m (193\\xa0ft).']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:53:41.667\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5aba549f554299232ef4a290', 'answer': 'a midtempo hip hop ballad', 'question': 'What kind of song did Alexander Grant produce for Eminem?', 'supporting_facts': [['Alex da Kid', 0], ['Alex da Kid', 1], ['Love the Way You Lie', 4]], 'context': [['Final Warning (song)', ['Final Warning is the second single released by the American recording artist Skylar Grey for her second studio album \"Don\\'t Look Down\".', ' The song was written by Alexander Grant and Grey, and produced by Grant.']], ['Smoke + Mirrors', ['Smoke + Mirrors is the second studio album by American rock band Imagine Dragons.', \" The album was recorded during 2014 at the band's home studio in Las Vegas, Nevada.\", \" Self-produced by members of the band along with English hip-hop producer Alexander Grant, known by his moniker Alex da Kid, the album was released by Interscope Records and Grant's KIDinaKORNER label on February 17, 2015, in the United States.\"]], ['Love the Way You Lie', ['\"Love the Way You Lie\" is a song recorded by the American rapper Eminem, featuring the Barbadian singer Rihanna, from Eminem\\'s seventh studio album \"Recovery\" (2010).', ' The singer and songwriter Skylar Grey wrote and recorded a demo of the song alongside the producer Alex da Kid when she felt she was in an abusive romantic relationship with the music industry.', ' Eminem wrote the verses and chose Rihanna to sing the chorus, resulting in a collaboration influenced by their past experiences in difficult relationships.', ' Recording sessions were held in Ferndale, 
Michigan, and Dublin, Ireland.', ' Backed by guitar, piano and violin, the track is a midtempo hip hop ballad with a pop refrain, sung by Rihanna, and describes two lovers who refuse to separate despite being in a dangerous love–hate relationship.']], ['Alexander Grant (Upper Canada politician)', ['Alexander Grant (20 May 1734 – 8 May 1813) was a Royal Navy officer, businessman, and politician in Upper Canada.', \" During his service with the Royal Navy Grant saw action in the Seven Years' War before becoming a naval superintendent.\", ' He then embarked on a career in the ship building industry before losing much of his wealth during the American Revolution.', ' Grant recovered, however, and rose to prominence in civil society, becoming the administrator of Upper Canada in 1805.']], ['Farewell (Rihanna song)', ['\"Farewell\" is a song by Barbadian recording artist Rihanna, from her sixth studio album \"Talk That Talk\" (2011).', ' The song was written by Ester Dean and Alexander Grant, with production helmed by Grant under his production name Alex da Kid.', ' Instrumentation consists of a piano.']], ['Alex da Kid', ['Alexander Grant (born 27 August 1982), professionally known as Alex da Kid, is a British music producer from Wood Green, London.', ' He has gained recognition for producing several hit singles for a plethora of artists in various music genres, such as Dr. 
Dre (\"I Need a Doctor\"), Nicki Minaj (\"Massive Attack\"), B.o.B (\"Airplanes\" featuring Hayley Williams), Eminem (\"Love the Way You Lie\" featuring Rihanna), Diddy (\"Coming Home\" with Dirty Money featuring Skylar Grey), Imagine Dragons (\"Radioactive\") and Cheryl (\"Under The Sun\").']], ['Bill Grant (curler)', ['William Alexander Grant (June 16, 1882 – April 16, 1942) was a Canadian curler.', ' He was the lead of the 1928 and 1929 Brier Champion teams (skipped by Gordon Hudson), representing Manitoba.', ' Grant was a 1975 inductee to the Canadian Curling Hall of Fame.', ' He died suddenly in 1942 while attending a curling meeting at the Fort Rouge Curling Club.']], ['Make the World Move', ['\"Make the World Move\" is a song recorded by American singer Christina Aguilera for her seventh studio album, \"Lotus\" (2012).', ' It features guest vocals from Cee Lo Green.', ' The song was written by Alexander Grant, Mike Del Rio, Candice Pillay, Jayson DeZuzio, Dwayne Abernathy and Armando Trovajoli.', ' Musically, the track is an up–tempo inspirational song, which combines dance, R&B and soul genres.', ' Lyrically, it is a positive attitude song which features horns and synthesizers as part of its instrumentation.']], ['Charles William Grant, 5th Baron de Longueuil', ['Charles William Grant was born in 1782.', ' He was the son of Captain David Alexander Grant and Marie-Charles-Joseph Le Moyne, Baronne de Longueuil.', ' He married Caroline Coffin, daughter of General John Coffin and Anne Mathews, in 1813.', ' He became a member of the Legislative Council of Lower Canada.', ' He succeeded to the title of Baron de Longueuil on 17 January 1841.', ' He died on 5 July 1848 at his residence of Aylwing House in Kingston.']], ['Sir Alexander Grant, 5th Baronet', ['Sir Alexander Grant, 5th Baronet (1 July 1705 - 1 August 1772) was prominent Scottish slave trader, active in the City of London in the mid eighteenth century.', ' As part of Grant, Oswald and Co., he owned 
Bunce Island in Sierra Leone.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:53:41.698\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5add2eab5542992ae4cec4da', 'answer': 'Hong Kong', 'question': 'What is the nationality of the actor who costarred with Joe Chen and Jia Nailiang in \"Destined to Love You\"?', 'supporting_facts': [['Destined to Love You', 0], ['Bosco Wong', 0]], 'context': [['The World (film)', ['The World () is a 2004 Chinese film written and directed by Jia Zhangke.', ' Starring Jia\\'s muse, Zhao Tao, as well as Chen Taisheng, \"The World\" was filmed on and around an actual theme park located in Beijing, Beijing World Park, which recreates world landmarks at reduced scales for Chinese tourists.', ' \"The World\" was Jia\\'s first to gain official approval from the Chinese government.', ' Additionally, it was the first of his films to take place outside of his home province of Shanxi.']], ['Fated to Love You (2008 TV series)', ['Fated to Love You (), also known as \"You\\'re My Destiny\", \"Sticky Note Girl\" or \"Destiny Love\", is a 2008 Taiwanese drama starring Joe Chen, Ethan Juan, Baron Chen and Bianca Bai.', ' The series was first broadcast in Taiwan on free-to-air Taiwan Television (TTV) (台視) from 16 March 2008 to 24 August 2008, every Sunday at 22:00 and cable 
TV Sanlih E-Television (三立電視) from 22 March 2008 to 30 August 2008, every Saturday at 21:00.', ' It was produced by Sanlih E-Television and directed by Chen Ming Zhang () with location filming in Taiwan, Hong Kong and Shanghai.']], ['Jia Nailiang', ['Jia Nailiang (; born April 12, 1984 in Harbin, Heilongjiang) is a Chinese actor.']], ['Destined to Love You', ['Destined to Love You (Chinese: 偏偏喜欢你) is a 2015 Chinese television series created by Tong Hua and starring Joe Chen, Jia Nailiang and Bosco Wong with a special appearance by Zheng Shuang.', ' It aired on Hunan TV from 16 June to 11 July 2015.']], ['Bosco Wong', ['Bosco Wong Chung-chak (, born 13 December 1980) is a Hong Kong actor under TVB management and singer under East Asia Music.']], ['Quitting', ['Quitting () is a 2001 Chinese drama film directed by Zhang Yang, starring and based on the true life story of Jia Hongsheng.', ' Jia, an actor and former drug addict, battled his addiction to marijuana and heroin for five years from 1992 to 1997.', \" All members of the cast, from Jia and Jia's family members right down to the doctors and patients at a mental institute Jia was admitted to, are real people playing themselves.\", ' The film premiered at the Venice Film Festival on 4 September 2001 and clinched the NETPAC Award.']], ['Ying Ye 3 Jia 1', ['Ying Ye 3 Jia 1 (樱野3加1), also known as Sakurano in the Philippines, is a Taiwanese drama that airs Sunday on TTV/SETTV.', ' This drama brings back Ming Dao and Joe Chen Qiao En.']], ['High Flying Songs of Tang Dynasty', ['High Flying Songs of Tang Dynasty, also known as Da Tang Ge Fei, and originally known in Chinese as 大唐歌妃, is a Chinese television series based on the romance between the Tang dynasty singer-dancer Xu Hezi (许合子) and her lover Yin Menghe (尹梦荷), as well as a fictitious account of their involvement in the events in the reign of Emperor Xuanzong.', ' Starring Ma Su and Jia Nailiang as the couple, the series was first aired on CCTV-8 in mainland China 
on 20 September 2003.']], ['Dad is Back', [\"Dad is Back () is a Chinese reality-variety show that airs on ZRTG's Zhejiang Television, starring former Taiwanese boy band Fahrenheit member Wu Chun, film producer and president of Huayi Brothers film production company Zhong Lei Wang, actor Jia Nailiang, and former national gymnast Li Xiapeng.\", ' The show began airing on April 24, 2014, Thursday nights at 10:00 PM Beijing Time with 12 episodes total.']], ['Once Upon a Time in the Northeast', ['Once Upon a Time in the Northeast is a 2017 Chinese action comedy film directed by Guo Dalei and starring Jia Nailiang, Ma Li, Wang Xun, Liang Chao, Yu Yang, Qu Jingjing, Eric Tsang and Chin Shih-chieh.', ' It was released in China on 3 February 2017.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 87%|████████▋ | 437/500 [03:31<01:12, 1.15s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:53:41.732\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a8a09ff5542992d82986e6b', 'answer': '7 February 14786', 'question': 'When did the husband of William Roper bornj who was also the father of Margaret Roper?', 'supporting_facts': [['Portrait Miniature of Margaret Roper', 0], ['Portrait Miniature of Margaret Roper', 1], ['Thomas More', 0]], 'context': [['Thomas More', ['Sir Thomas More ( ; 7 February 14786 July 1535), venerated by Roman Catholics as Saint Thomas More, was an English lawyer, social philosopher, author, statesman and noted Renaissance humanist.', ' He was also a councillor to Henry VIII, and Lord High Chancellor of England from October 1529 to 16 May 1532.', ' He wrote \"Utopia\", published in 1516, about the political system of an imaginary ideal island nation.']], ['Chris Roper', ['Christopher George William Roper (born 20 May 1991) is an English cricketer.', ' Roper is a right-handed batsman who bowls right-arm fast medium pace.', ' He was born in Bristol.']], ['William Roper', ['William Roper (c. 1496 – 4 January 1578) was an English lawyer and member of Parliament.', ' The son of a Kentish gentleman, he married Margaret, daughter of Sir Thomas More.', ' He wrote a highly regarded biography of his father-in-law.']], ['Robert William Roper House', ['The Robert William Roper House is a historic house at 9 East Battery in Charleston, South Carolina.', ' It was built on land purchased by Robert W. 
Roper, a prominent cotton planter, in May 1838.', ' The house is an outstanding example of early 19th Century Greek Revival architecture, built on a monumental scale.', ' Although there are now two houses between the Roper House and White Point Garden to the south, at the time of its construction nothing stood between the house and the harbor beyond.', ' \"It is said that Mr. Roper intended his showcase home to be the first residence seen by visitors approaching Charleston from the sea.\"']], ['Thomas More Catholic School, Purley', ['Thomas More Catholic School is a Roman Catholic secondary school and sixth form, located in the Purley area of the London Borough of Croydon, England.', ' The Margaret Roper Primary School is located adjacent to Thomas More Catholic School.']], ['Tudor Barn, Eltham', ['The Tudor Barn is a large brick barn in Eltham in the Royal Borough of Greenwich.', ' It was built in 1525 by William Roper.', ' The Ropers lived next door in a manor house in the center of a moat for several years.', ' William married Margaret More, the daughter of Thomas More, who at the time was the lord chancellor to Henry VIII.', ' It is a Grade II* listed building (as Well Hall Art Gallery).']], ['Bill Roper (American football)', ['William Winston \"Bill\" Roper (August 22, 1880 – December 10, 1933) was an American football, basketball, and baseball player and coach.', ' He served as the head football coach at the Virginia Military Institute (1903–1904), Princeton University (1906–1908, 1910–1911, 1919–1930), the University of Missouri (1909), and Swarthmore College (1915–1916), compiling a career college football record of 112–38–18.', \" Roper's Princeton Tigers football teams of 1906, 1911, 1920, and 1922 have been recognized as national champions.\", ' His 89 wins are the most of any coach in the history of the program.', ' Roper was also the head basketball coach at Princeton for one season in 1902–03, tallying a mark of 8–7.', ' Roper played football as an 
end, basketball, and baseball as an outfielder at Princeton, from which he graduated in 1902.', ' He was inducted into the College Football Hall of Fame as a coach in 1951.']], ['Portrait Miniature of Margaret Roper', ['Portrait Miniature of Margaret Roper is a painting by the German artist and printmaker Hans Holbein the Younger created between 1535–36, and today held in the Metropolitan Museum of Art in New York.', ' Margaret Roper (1505–44) was the eldest child of Sir Thomas More and wife of the English biographer William Roper.', ' It is the second and less well known of two portraits of Roper painted by Holbein.', ' The first, \"Portrait of an English Woman\", is generally believed to show Roper but may depict another unknown lady of the English court.', \" The New York work was painted during the artist's second visit to London, likely in the mid-1530s.\"]], ['Russell Hill, Croydon', ['Russell Hill is an area in the London Borough of Croydon, located to the north-west of Purley.', \" It is named after former British Prime Minister John Russell, 1st Earl Russell who was President of the Warehousemen, Clerks and Drapers School which was built here in 1886; prior to this the locality was known as Beggar's Thorn or Beggar's Bush.\", ' The area is now home to Margaret Roper Catholic Primary School and Thomas More Catholic School.']], ['Margaret Roper', ['Margaret Roper (\"née\" More) (1505–1544) was an English writer and translator, and one of the most learned women of sixteenth-century England.', ' She was the daughter of Sir Thomas More and Jane Colt, who probably died in childbirth.', ' Margaret, or \"Meg\" as her father called her, was a frequent visitor during More\\'s imprisonment in the Tower of London.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current 
AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:53:41.789\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5ab488bc5542990594ba9c55', 'answer': 'Iranian-American', 'question': 'Insomniac Events is part of a partnership with a club founded by an investor of what heritage?', 'supporting_facts': [['Insomniac Events', 4], ['Sam Nazarian', 0]], 'context': [['Together as One (festival)', ['Together as One was an electronic music festival.', \" It was held on New Year's Eve in Los Angeles.\", ' It was a joint production by promoters Go Ventures and Insomniac Events through 2010, but is now promoted solely by Go Ventures.', ' Taking place in downtown Los Angeles, Together As One attracts audiences of over 40,000 dance music enthusiasts each year.']], ['Escape Halloween', ['Escape Halloween is an electronic music festival held in Southern California around Halloween.', ' It is one of Insomniac Events music festivals running annually since 2011.', ' There are Halloween walk-through mazes, themed stages, and costumed performers.', ' Genres include EDM, house, dance, electro house, drum and bass, techno, dance-punk, hardstyle, dubstep, trance, and more.', \" Previous hosted stages include: Audiotistic, Bassrush, Richie Hawtin's ENTER., and Laidback Luke's Super You & Me.\", ' The event was originally named Escape from Wonderland, but was later changed to Escape Halloween with changing themes throughout each annual festival.', ' Themes for the event are all based around horror and range from things such as Escape from Wonderland (2011) and Escape 
Psycho Circus (2015).']], ['Union, University & Schools Club', ['Union, University & Schools Club is a private, social club founded in 1857.', ' and based in Sydney at 25 Bent Street.', ' The Club was formed by a merger between the Union Club and the University & Schools Club in January 2007.', ' Members must be nominated and seconded and the annual membership fee is only disclosed to potential members.', ' The Club has reciprocal relationships with other like minded clubs around the world, including the Melbourne Club, the Alexandra Club in Melbourne, the Turf Club, the Garrick Club and the Athenaeum Club, London, the Hong Kong Club, the Jonathan Club in Los Angeles and the Metropolitan Club and the Lotos Club in New York.']], ['Electric Forest Festival', ['Electric Forest Festival is an eight-day, two-weekend, multi-genre event with a focus on electronic and jam band genres, held in Rothbury, Michigan, at the Double JJ Resort.', ' The original event was called Rothbury Festival, debuted in 2008, and focused on jam bands and rock bands.', ' The event was not held in 2010.', ' Electric Forest, which debuted in 2011, is co-produced by Madison House Presents and Insomniac Events.', ' The 2015 event drew an estimated 45,000 attendees.']], ['Sprite Car Club of Australia', ['The Sprite Car Club of Australia is a club founded in 1960 for owners and enthusiasts of Austin-Healey Sprites and MG Midget cars.', '[1] The club has social events and sporting programs for amateur racers.']], ['Insomniac Events', ['Insomniac Events, founded by Pasquale Rotella, is an American tour promoter focusing primarily on electronic dance music events.', ' It organizes a number of major dance music festivals, including its flagship Electric Daisy Carnival, along with other events such as Beyond Wonderland, Nocturnal Wonderland and Escape From Wonderland.', ' It jointly organized the Together as One festival with rival promoter Go Ventures prior to 2011.', ' Insomniac also organizes the 
\"EDMBiz\" conference (an industry event that first took place in 2012 to coincide with EDC Las Vegas, in a similar fashion to the Winter Music Conference and the Ultra Music Festival).', ' Insomniac is involved in the operation of three Los Angeles nightclubs—Create (in partnership with SBE, built on the site of the former Vanguard Hollywood), Exchange L.A. and the underground warehouse Factory 93, located at 1756 Naud Street.', ' Insomniac also organizes drum and bass and dubstep-oriented events under the brand Bassrush, hardstyle events under the brand Basscon and trance festivals under the brand Dreamstate.']], ['Todd Mission, Texas', ['Todd Mission is a city in Grimes County, Texas, United States.', ' It lies on Farm Road 1774, 50 mi northwest of Houston.', ' The population was 107 as of the 2010 census, down from 146 at the 2000 census.', ' The city is home to the Texas Renaissance Festival and Middlelands Music Festival by Insomniac Events.']], ['White Wonderland', ['White Wonderland is an electronic music festival co-organized by Insomniac Events and fellow club promoter Giant.', \" The event was first held for New Year's Eve in 2011, following the announcement that Insomniac had pulled out of co-organizing the New Year's Eve festival Together as One due to conflicts with its fellow organizer Go Ventures.\"]], ['Beyond Wonderland', ['Beyond Wonderland is an electronic dance festival organized by Insomniac Events.', ' The event has been held in various locations across the west coast including Seattle, San Bernardino, and Mountain View spanning either one or two days.']], ['Sam Nazarian', ['Sam Nazarian (born 1975) is an Iranian-American businessman, investor and philanthropist.', ' He is the Founder, Chairman and CEO of SBE Entertainment Group.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded 
the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:53:41.829\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7d3f895542995f4f402248', 'answer': 'mountain Banshee', 'question': 'Pandora- The World of Avatar includes a ride that allows guest to take flight on what animal?', 'supporting_facts': [['Pandora – The World of Avatar', 2], ['Avatar Flight of Passage', 1]], 'context': [['Take Flight, LLC', ['Take Flight, LLC is a clothing brand founded in 2008 in Portland, Oregon, United States that makes custom apparel for fans and practitioners of parkour all around the world.']], ['Fictional universe of Avatar', ['In the 2009 science fiction film \"Avatar\", director James Cameron conceived a fictional universe in which humans seek to mine unobtanium on the fictional exoplanetary moon, Pandora.', \" The Earth-like moon is inhabited by a sapient indigenous humanoid species called the Na'vi, and varied fauna and flora.\", \" Resources Development Administration (RDA) scientists, administrators, recruits, support, and security personnel travel to Pandora in the 22nd century to discover this lush world, which is inhabited by many lifeforms including the human-like Na'vi.\", ' The clan with which the humans have contact in the film \"[lives] in a giant tree that sits on a vast store of a mineral called unobtanium, which humans want as an energy supply.\"']], ['Feral chicken', ['Feral chickens are derived from domestic chickens (\"Gallus gallus 
domesticus\") who have returned to the wild.', ' Like the red junglefowl (the closest wild relative of domestic chickens), feral chickens will take flight and roost in tall trees and bushes in order to avoid predators at night.']], ['Smoke (donkey)', ['Smoke, also known as Smoke the Donkey, became a therapy animal for the United States Marine Corps during the Iraq War.', ' Smoke lived on Camp Taqaddum in Iraq from 2008 to 2009 among the Marines of the 1st Marine Logistics Group who were deployed there.', ' In 2011, Smoke traveled half way around the world to the United States, the only Donkey to make such a journey.', ' The process to relocate Smoke from Iraq to the United States required senior level diplomatic coordination by multiple countries, and the assistance of the Society for the Prevention of Cruelty to Animals.', ' Once in the United States, Smoke lived at Take Flight Farms in Omaha, Nebraska.']], [\"Na'vi River Journey\", ['Na\\'vi River Journey is a dark ride attraction at Disney\\'s Animal Kingdom\\'s Pandora – The World of \"Avatar\".', ' The ride takes guests through the Kasvapan River of Pandora from the 2009 film \"Avatar\", showcasing native animals and bioluminescent flora, with inclusion of Audio-animatronics.']], ['Listen to the Crows as They Take Flight', ['Listen To The Crows As They Take Flight is the fourth album by Kid Dakota.', ' It was released on October 11, 2011, by Graveface Records.']], ['Pandora – The World of Avatar', ['Pandora – The World of \"Avatar\" is a themed area inspired by James Cameron\\'s \"Avatar\", located within Disney\\'s Animal Kingdom theme park at the Walt Disney World Resort in Bay Lake, Florida, near Orlando.', ' Set a generation after the events of the \"Avatar\" films, the area is based upon the fictional exoplanetary moon, Pandora, and features Pandora\\'s floating mountains, alien wildlife, and bioluminescent plants.', ' Spanning 12 acres , Pandora – The World of \"Avatar\" includes two major attractions, 
\"Avatar\" Flight of Passage and Na\\'vi River Journey, as well as retail and dining outlets.']], ['Avatar Flight of Passage', ['\"Avatar\" Flight of Passage is a 3D augmented reality flying simulator attraction within Pandora – The World of \"Avatar\" at Disney\\'s Animal Kingdom which opened on May 27, 2017.', ' The attraction allows guests to take flight on a mountain Banshee and soar across the landscape of Pandora.']], ['Dreams Take Flight', ['The Dreams Take Flight program was created by a group of Air Canada employees to give a trip of a lifetime to Disney World for a day for children with special needs and/or the siblings of children with special needs.', ' It has been in operation since 1989.']], ['Kid Dakota', ['Kid Dakota is the musical moniker of Darren Jackson.', ' He started performing as \"Kid Dakota and the Tumbleweeds\" in 1998 while living in Providence, Rhode Island.', ' The name was chosen in homage to his home state of South Dakota and also as a parody of Kid Rock.', ' In the summer of 1999, Darren recorded the five songs that would appear on the So Pretty ep with long-time friend and producer, Alex Oana, at City Cabin (formerly Blackberry Way).', ' Darren moved to Minneapolis, Minnesota that winter and self-released the So Pretty ep in the spring of 2000.', \" The ep caught the attention of Alan Sparhawk, singer and guitarist for the seminal slow-core band, Low (band) and he offered to release the ep on his label, Chairkickers' Union under the condition that it be expanded into a full-length lp.\", ' The LP version of \"So Pretty\" was released in the spring of 2002 with three additional songs.', ' In 2004 his second album, \"The West is the Future\" was also released by Chairkickers.', ' It was recorded live at Seedy Underbelly in Minneapolis, MN by Alex Oana and featured Zak Sally, the bassist from Low. 
\"', 'A Winner\\'s Shadow,\" was released on March 11, 2008 on Graveface Records.', ' His new album, \"\\'Listen to the Crows as They Take Flight\" was released by Graveface in October 2011.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 88%|████████▊ | 440/500 [03:31<00:48, 1.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:53:41.892\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a778b1e55429949eeb29ee3', 'answer': 'Nashville Tribute Band', 'question': 'What band is a tribute to a band originally known as the Grizzly River Boys?', 'supporting_facts': [['Nashville Tribute Band', 0], ['Diamond Rio', 0], ['Diamond Rio', 1]], 'context': [['Tony Hiller', ['Anthony Toby \"Tony\" Hiller (born 30 July 1927) is a British songwriter.', ' He began his musical career as a member of the song and dance duo The Hiller Brothers, sharing the stage with his brother Irving.', \" The Hiller Brothers appeared with many artists of the time including Alma Cogan, Tommy Cooper, Val Doonican, Matt Monro, The Shadows, Bernard Manning, Kathy Kirby, Roger Whittaker, Rip Taylor, Gene Vincent, Lance Percival, Tessie O'Shea, Frank Ifield, Deep River Boys, The Dallas Boys, Clark Brothers, Paul Melba and Ray 
Burns.\"]], ['Grizzly River Run', ['Grizzly River Run is located at Disney California Adventure at the Disneyland Resort in Anaheim, California.', \" It is similar to Kali River Rapids in Disney's Animal Kingdom but distinctive as the rafts are engineered to spin as they descend chutes.\", \" The attraction's name comes from Grizzly Peak, the bear shaped mountain that the rapids flow around.\", ' It was designed by Walt Disney Imagineering and constructed by Intamin.']], ['Diamond Rio', ['Diamond Rio is an American country and Christian country music band.', ' The band was founded in 1982 as an attraction for the Opryland USA theme park in Nashville, Tennessee, and was originally known as the Grizzly River Boys, then the Tennessee River Boys.', ' It was founded by Matt Davenport, Danny Gregg, and Ty Herndon, the last of whom became a solo artist in the mid-1990s.', ' After undergoing several membership changes in its initial years, the band has consisted of the same six members since 1989: Marty Roe (lead vocals, rhythm guitar), Gene Johnson (mandolin, guitar, fiddle, tenor vocals), Jimmy Olander (lead guitar, Dobro, banjo), Brian Prout (drums), Dan Truman (keyboards, organ, synthesizer), and Dana Williams (bass guitar, baritone vocals).']], ['Nashville Tribute Band', ['The Nashville Tribute Band is a Mormon music group founded by Jason Deere and Dan Truman, the pianist of the popular country group Diamond Rio.']], ['Little Eyes', [\"Little Eyes or Little Lize (Lil' Lize) is a folksong that is popular in Cornwall, England, UK, although it originated in America.\", ' It was written by Buford Abner of the Swannee River Boys in the late 1940s or early 1950s.', ' The first known recording is from the 1950s by an American harmony group called the Delta Rhythm Boys.']], ['Richard Renaldi', ['Richard Renaldi (born 1968) is an American portrait photographer.', ' His four main books each contain portraits of people Renaldi met in public, and some landscapes, made over 
numerous years with an 8×10 large format view camera.', ' Renaldi\\'s books are: \"Figure and Ground\" (2006) – various people throughout the USA; \"Fall River Boys\" (2009) – young men (and some women) growing up in the post-industrial city of Fall River, Massachusetts; \"Touching Strangers\" (2014) – strangers posed by Renaldi physically touching in some way, made all over the USA; and \"Manhattan Sunday\" (2016) – LGBT people photographed between midnight and 10 am on Sundays mainly on the streets of Manhattan having left nightclubs.']], ['The Hillmen (album)', ['The Hillmen is a studio album by The Hillmen, a southern California bluegrass band originally known as The Golden State Boys.', ' The Hillmen consisted of Chris Hillman (later of The Byrds, The Flying Burrito Brothers, Manassas and The Desert Rose Band) on mandolin, country singer/songwriter Vern Gosdin on guitar and lead vocals, his brother Rex Gosdin on double bass, and Don Parmley (later of the Bluegrass Cardinals) on banjo.']], ['Buford Abner', ['James Buford Abner (November 10, 1917 - November 19, 2011) was an American songwriter, musician and singer who worked during the early days of country music, working in both secular and gospel country music genres.', ' With his brother Merle Abner, his uncle Stacy Abner, George Hughes and Billy Carrier, he was a member of the Swanee River Boys.', ' He was inducted into the Southern Gospel Music Association Hall of Fame in 2002 and the Atlanta Country Music Hall of Fame as a member of the Swanee River Boys.']], ['Darryl Braxton', ['Darryl \"Brax\" Braxton is a fictional character from the Australian soap opera \"Home and Away\", played by Steve Peacocke.', ' He made his first screen appearance during the episode broadcast on 16 February 2011.', ' The character was created and introduced along with his two brothers; Heath (Dan Ewing) and half-brother, Casey (Lincoln Younes).', ' The trio were nicknamed the River Boys and were inspired by the real life Bra 
Boys group.', ' When Peacocke learnt about the role of Brax, he initially thought he would not suit the part as he is from the country.', ' However, after learning more about the character, Peacocke successfully auditioned for the role.', \" Peacocke's departure was announced on 1 February 2015 and Brax made a temporary exit on 10 June 2015, before returning on 9 December.\", ' He made his final appearance on 7 June 2016.']], ['Casey Braxton', ['Casey Braxton is a fictional character from the Australian Channel Seven soap opera \"Home and Away\", played by Lincoln Younes.', ' Casey made his first on-screen appearance on 17 February 2011.', ' Younes was about to go travelling when he auditioned for the role of Casey.', ' He changed his plans upon winning the role.', ' In late 2010 the Seven Network began airing trailers for a new trio of characters known as \"The River Boys\".', ' The trio consist of Casey and his older half-brothers Darryl (Steve Peacocke) and Heath Braxton (Dan Ewing).', ' The River Boys arrive in Summer Bay from neighbouring town Mangrove River.', ' Casey is characterised as being a \"modern day \"Rebel Without a Cause\"\"; who is intelligent and unsure about what he wants out of life.', ' Younes has described him as the \"epitome of teenage angst\".', \" The River Boys cause trouble in Summer Bay and producers were inspired by Koby Abberton's Bra Boys in the creation process.\", ' Casey is portrayed as wanting to distance himself from their bad reputation; but his anger issues often mar his attempts.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 4 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "\u001b[32m2025-12-09 17:53:42.213\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:45.904\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▊ | 443/500 [03:35<00:55, 1.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:46.083\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 445/500 [03:36<00:47, 1.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:53.877\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:53.883\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 89%|████████▉ | 447/500 [03:43<01:20, 1.51s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:53.922\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:53.933\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:53.970\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:53.983\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:54.142\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 12 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 90%|█████████ | 452/500 [03:43<00:39, 1.20it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:54.265\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 17 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:53:54.383\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 11 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████ | 454/500 [03:44<00:31, 1.47it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:02.866\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:02.924\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 9 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████ | 456/500 [03:52<01:06, 1.51s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:03.229\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 91%|█████████▏| 457/500 [03:52<00:57, 1.34s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:03.745\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 8 seconds. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 92%|█████████▏| 458/500 [03:53<00:50, 1.20s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:05.359\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 6 seconds. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 92%|█████████▏| 460/500 [03:55<00:41, 1.04s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:11.139\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 92%|█████████▏| 461/500 [04:00<01:22, 2.12s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:11.365\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mevoagentx.evaluators.evaluator\u001b[0m:\u001b[36m_evaluate_single_example\u001b[0m:\u001b[36m205\u001b[0m - \u001b[33m\u001b[1mError evaluating example and set the metrics to None:\n", "Example: {'_id': '5a7782b95542995d8318117f', 'answer': 'Travelers', 'question': 'What insurance company sponsored the Curling Skins Game in 2014?', 'supporting_facts': [['2014 Travelers All-Star Curling Skins Game', 0], ['The Travelers Companies', 0]], 'context': [['The Travelers Companies', ['The Travelers Companies, Inc. is an American insurance company.', ' It is the second largest writer of U.S. commercial property casualty insurance and the third largest writer of U.S. personal insurance through independent agents.', ' Travelers is incorporated in Minnesota, with headquarters in New York City and its largest office in Hartford, Connecticut.', ' Travelers also maintains a large office in St. 
Paul, Minnesota.', ' It has been a component of the Dow Jones Industrial Average since June 8, 2009.']], [\"2017 Pinty's All-Star Curling Skins Game\", [\"The 2017 Pinty's All-Star Curling Skins Game was held from February 3 to 5 at The Fenlands Banff Recreation Centre in Banff, Alberta.\"]], ['2007 Casino Rama Curling Skins Game', ['The 2007 Casino Rama Curling Skins Game on TSN was held on December 8th and 9th at the Casino Rama Entertainment Centre in Rama, Ontario.', ' It was the first TSN Skins Game put on since it was put on hiatus in 2004.', ' The total purse for the event was CAD$100,000.']], ['2010 Casino Rama Curling Skins Game', ['The 2010 Casino Rama Curling Skins Game on TSN was held on January 16th and 17th at the Casino Rama Entertainment Centre in Rama, Ontario.', ' The total purse for the event was CAD$100,000.']], ['2013 The Dominion All-Star Curling Skins Game', ['The 2013 Dominion All-Star Curling Skins Game was held from January 19 to 20 at the Casino Rama Entertainment Centre in Rama, Ontario.', ' The total purse for the event was CAD$100,000.']], ['2012 Casino Rama Curling Skins Game', ['The 2012 Casino Rama Curling Skins Game on TSN was held on January 7 and 8 at the Casino Rama Entertainment Centre in Rama, Ontario.', ' The total purse for the event was CAD$75,000.']], ['2009 Casino Rama Curling Skins Game', ['The 2009 Casino Rama Curling Skins Game on TSN was held on January 10th and 11th at the Casino Rama Entertainment Centre in Rama, Ontario.', ' The total purse for the event was CAD$ 100,000.']], [\"2015 Pinty's All-Star Curling Skins Game\", [\"The 2015 Pinty's All-Star Curling Skins Game was held from January 16 to 18 at The Fenlands Banff Recreation Centre in Banff, Alberta.\"]], ['TSN Skins Game', ['The TSN Curling Skins Game is an annual curling bonspiel hosted by The Sports Network. 
\"', 'Skins\" curling had been developed as a way to make curling more interesting on TV during the time before the free guard zone rule was implemented.', ' The bonspiel was held annually from 1986 to 2004 before being revived as the Casino Rama Curling Skins Game in 2007.', \" In 2013, Dominion of Canada took over naming rights to the event, which also shifted into an all-star format featuring teams of top Canadian curling players, but the format reverted to the original format in 2015, when Pinty's acquired the naming rights to the event.\"]], ['2014 Travelers All-Star Curling Skins Game', ['The 2014 Travelers All-Star Curling Skins Game was held on January 11 and 12 at The Fenlands Banff Recreation Centre in Banff, Alberta.', ' The total purse for the event was CAD$100,000.']]], 'type': 'bridge', 'level': 'hard'}\n", "Error: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 92%|█████████▏| 462/500 [04:00<01:02, 1.64s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m2025-12-09 17:54:12.416\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. 
This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 93%|█████████▎| 463/500 [04:02<00:54, 1.49s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "\u001b[32m2025-12-09 17:54:12.524\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mevoagentx.workflow.workflow\u001b[0m:\u001b[36masync_execute\u001b[0m:\u001b[36m104\u001b[0m - \u001b[31m\u001b[1mAn Error occurs when executing the workflow: Error during single_generate_async: litellm.RateLimitError: AzureException RateLimitError - Your requests to gpt-4o-mini for gpt-4o-mini in East US have exceeded the token rate limit for your current AIServices S0 pricing tier. This request was for ChatCompletions_Create under Azure OpenAI API version 2025-01-01-preview. Please retry after 1 second. 
To increase your default rate limit, visit: https://aka.ms/oai/quotaincrease.\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 93%|█████████▎| 464/500 [04:02<00:39, 1.11s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 93%|█████████▎| 465/500 [04:02<00:29, 1.17it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 94%|█████████▎| 468/500 [04:02<00:14, 2.20it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.2857142857142857, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 94%|█████████▍| 470/500 [04:02<00:08, 3.36it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 95%|█████████▍| 473/500 [04:03<00:04, 5.53it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 95%|█████████▌| 477/500 [04:03<00:02, 9.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.5, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", 
"Evaluating workflow: 96%|█████████▌| 479/500 [04:03<00:02, 7.93it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.4444444444444445, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n", "metrics {'f1': 0.5263157894736842, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 96%|█████████▌| 481/500 [04:04<00:04, 4.06it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 97%|█████████▋| 483/500 [04:05<00:04, 4.15it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.47619047619047616, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.8, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 97%|█████████▋| 487/500 [04:05<00:01, 7.98it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.375, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 489/500 [04:05<00:01, 9.27it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 0.0}\n", "metrics {'f1': 0.1, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0.5714285714285715, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 98%|█████████▊| 491/500 [04:05<00:01, 8.64it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.28571428571428575, 'em': 0.0, 'acc': 0.0}\n", "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", 
"Evaluating workflow: 99%|█████████▊| 493/500 [04:06<00:01, 5.83it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.33333333333333337, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 494/500 [04:07<00:01, 3.89it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.05555555555555556, 'em': 0.0, 'acc': 1.0}\n", "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 496/500 [04:07<00:00, 4.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.6666666666666666, 'em': 0.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 99%|█████████▉| 497/500 [04:07<00:00, 4.73it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 498/500 [04:08<00:00, 3.79it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 1.0, 'em': 1.0, 'acc': 1.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "Evaluating workflow: 100%|█████████▉| 499/500 [04:09<00:00, 2.13it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0.8, 'em': 0.0, 'acc': 0.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Evaluating workflow: 100%|██████████| 500/500 [04:11<00:00, 1.99it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "metrics {'f1': 0, 'em': 0.0, 'acc': 0.0}\n", "Evaluation metrics: {'f1': 0.51540291362048, 'em': 0.37553648068669526, 'acc': 0.5643776824034334}\n", "\u001b[32m2025-12-09 17:54:21.792\u001b[0m | \u001b[1mINFO \u001b[0m | 
\u001b[36mevoagentx.workflow.workflow_graph\u001b[0m:\u001b[36msave_module\u001b[0m:\u001b[36m1201\u001b[0m - \u001b[1mSaving SEWWorkFlowGraph to debug/optimized_sew_workflow_update_correct_round20_step20_gpt4omini_optzall.json\u001b[0m\n", "\u001b[32m2025-12-09 17:54:21.793\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mevoagentx.utils.utils\u001b[0m:\u001b[36mmake_parent_folder\u001b[0m:\u001b[36m19\u001b[0m - \u001b[1mcreating folder debug ...\u001b[0m\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# create the SEWOptimizer for the workflow graph once it has more roles\n", "optimizer = SEWOptimizer(\n", " graph=sew_graph, \n", " evaluator=evaluator, \n", " llm=llm, \n", " max_steps=20,\n", " eval_rounds=1, \n", " repr_scheme=\"python\", \n", " optimize_mode=\"all\", \n", " order=\"zero-order\",\n", " max_rounds=20,\n", ")\n", "\n", "# Disabled: evaluation of the workflow before optimization.\n", "# NOTE: this snippet still passes `humaneval`, while the active code below uses `benchmark`.\n", "# with suppress_logger_info():\n", "# metrics = optimizer.evaluate(dataset=humaneval, eval_mode=\"test\")\n", "# print(\"Evaluation metrics: \", metrics)\n", "\n", "# optimize the SEW workflow\n", "optimizer.optimize(dataset=benchmark)\n", "\n", "# evaluate the optimized SEW workflow on the test split\n", "# NOTE(review): the recorded run of this step logged many Azure RateLimitError failures,\n", "# so the stored metrics likely understate the workflow; confirm by re-running within the rate limit.\n", "with suppress_logger_info():\n", " metrics = optimizer.evaluate(dataset=benchmark, eval_mode=\"test\")\n", "print(\"Evaluation metrics: \", metrics)\n", "\n", "# save the optimized SEW workflow\n", "optimizer.save(\"debug/optimized_sew_workflow_update_correct_round20_step20_gpt4omini_optzall.json\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "c5f272e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'f1': 0.51540291362048, 'em': 0.37553648068669526, 'acc': 0.5643776824034334}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metrics" ] }, { "cell_type": "code", "execution_count": null, "id": "463adce5", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "947901ef", "metadata": {}, "outputs": [], "source": [] }, {
"cell_type": "code", "execution_count": null, "id": "f50bf6b2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "defc586a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "902b3a5b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 5 }