# %% [markdown]
# ## Setup

# %%
from langchain_benchmarks import registry
from langchain_core.language_models.llms import LLM
import requests
from strenum import StrEnum
from pydantic import BaseModel, Field
from rich import print as rprint
from typing import Optional, List, Dict, Any
import bittensor as bt
import os

# %%
subnet = bt.metagraph(netuid=20, network="finney")

# Wallet and validator setup
WALLET_NAME = ""  # TODO, put your coldkey
HOTKEY_NAME = ""  # TODO, put your hotkey
# TODO put your LangChain API Key here if you wish to dig through lang smith results.
# When left empty, a key that is already exported in the environment is kept
# instead of being overwritten with an empty string.
LANGCHAIN_API_KEY = ""
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
vali_wallet = bt.wallet(name=WALLET_NAME, hotkey=HOTKEY_NAME)
vali_dendrite = bt.dendrite(wallet=vali_wallet)

# UIDs are split on `subnet.S`: above the threshold (and holding a validator
# permit) counts as a validator, everything at or below it as a miner.
VALIDATOR_MIN_STAKE = 20000
validator_uids = subnet.uids[((subnet.S > VALIDATOR_MIN_STAKE) & subnet.validator_permit)]
miner_uids = subnet.uids[(subnet.S <= VALIDATOR_MIN_STAKE)]


class Tool(BaseModel):
    """Description of one tool that is offered to the miner."""

    name: str
    description: str
    # Maps argument name -> spec dict (bind_tools fills in required/type/description).
    arguments: Dict[str, Dict[str, Any]]

    def toJSON(self):
        """Return the tool as a plain dict with name, description and arguments."""
        return {"name": self.name, "description": self.description, "arguments": self.arguments}

    def to_dict(self):
        """Return the pydantic dict representation of the tool."""
        return self.dict()


class ChatRole(StrEnum):
    """Roles a message can carry in the miner protocol."""

    ASSISTANT = "assistant"
    USER = "user"
    TOOL_CALL = "tool call"
    TOOL_RESPONSE = "tool response"


class ChatMessage(BaseModel):
    """A single message of the conversation that is sent to, or received from, the model."""

    role: ChatRole = Field(
        title="One of the ChatRole's to identify who the message is coming from.",
    )
    content: str | dict | list = Field(
        title="Contents of the chat message.",
    )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]):
        """Create a ChatMessage object from a dictionary with 'role' and 'content' keys."""
        return cls(role=ChatRole(data['role']), content=data['content'])

    def to_dict(self) -> Dict[str, Any]:
        """Return the message as a {'role', 'content'} dict."""
        return {"role": self.role, "content": self.content}

    def toJSON(self):
        """Alias of to_dict()."""
        return self.to_dict()


class Conversation(BaseModel):
    """Ordered list of ChatMessage objects."""

    messages: List[ChatMessage] = []

    @classmethod
    def from_list(cls, data_list: List[Dict[str, Any]]):
        """Create a Conversation object from a list of dictionaries."""
        return cls(messages=[ChatMessage.from_dict(item) for item in data_list])

    def to_list(self):
        """Return the messages as a list of {'role', 'content'} dicts."""
        return [msg.to_dict() for msg in self.messages]

    def toJSON(self):
        """Alias of to_list()."""
        return self.to_list()


# the request protocol
class QnATask(bt.Synapse):
    """Synapse carrying a prompt/conversation (plus optional tools and context) to miners."""

    urls: List[str] = []  # not used right now - when enabled would allow users to pass in URLs for content
    datas: List[dict] = []  # used to pass in relevant context, could be a company knowledge base or a set of wikipedia pages
    tools: List[Tool] = []  # used to pass in tools to be leveraged in answering user query
    notes: str = "No Notes"
    prompt: str = ""  # the query / prompt
    messages: List[ChatMessage] = []
    response: Optional[dict] = {}
    timeout: Optional[float] = 3.0
    miner_uids: Optional[List[int]] = []  # put our TOP miner into the network as the miner to query (if empty list, a random list of miners will be selected)

    def toJSON(self):
        """Return the request, its response and the transport status codes as a plain dict."""
        return {
            "prompt": self.prompt,
            "urls": self.urls,
            "datas": self.datas,
            "tools": [t.toJSON() for t in self.tools],
            "notes": self.notes,
            # `messages` is a plain list, so every element is serialised on its
            # own; calling .toJSON() on the list itself raised AttributeError.
            "messages": [m.toJSON() for m in self.messages],
            "response": self.response,
            "miner_uids": self.miner_uids,
            "dendrite_process_time": self.dendrite.process_time,
            "dendrite_status_code": self.dendrite.status_code,
            "axon_status_code": self.axon.status_code,
        }


# %%
import json
import time
import uuid
from langchain_core.outputs.chat_generation import ChatGeneration
from langchain.agents.output_parsers.tools import ToolAgentAction
from langchain_core.messages.ai import AIMessageChunk
from langchain_core.messages import AIMessage
from langchain.schema.output import LLMResult

from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Union, Type, Callable, Literal
from langchain_core.runnables import Runnable
# NOTE(review): this re-binds `BaseModel` / `Field` to LangChain's pydantic_v1
# shim. The protocol models above were already built with the names imported
# from `pydantic`, so re-running that cell after this one would change which
# pydantic flavour they use.
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr, root_validator
from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import run_in_executor
from langchain_core.tools import BaseTool
from langchain_core.language_models import LanguageModelInput

# Seconds the dendrite waits for the queried miners.
MINER_QUERY_TIMEOUT_S = 60

# LangChain message `type` -> protocol role. Anything not listed (system and
# human messages included) is sent as a user message, as before.
_ROLE_BY_MESSAGE_TYPE = {
    "ai": ChatRole.ASSISTANT,
    "tool": ChatRole.TOOL_RESPONSE,
}


def _parse_miner_payload(payload, expect_json):
    """Extract a usable reply from one miner's `response` dict.

    Returns either a non-empty string or a non-empty list of dicts that each
    carry 'role' and 'content'; returns None when the payload holds nothing
    usable. With `expect_json` set, a string reply is JSON-decoded first, so a
    malformed reply raises and the caller can log it and try the next miner.
    """
    if not isinstance(payload, dict) or "response" not in payload:
        return None

    reply = payload["response"]
    if expect_json and isinstance(reply, str):
        reply = json.loads(reply)

    if isinstance(reply, str):
        return reply or None
    if isinstance(reply, list) and reply and all(
        isinstance(item, dict) and "role" in item and "content" in item for item in reply
    ):
        return reply
    return None


class CustomChatModelAdvanced(BaseChatModel):
    """LangChain chat model that answers by querying miners through `vali_dendrite`.

    Each call sends the conversation (and any bound tools) as a QnATask to the
    miners in `top_miner_uids` and turns the first usable reply into an
    AIMessage.
    """

    # Annotated so that pydantic treats them as regular model fields.
    # The three UIDs that come first when sorting by descending `subnet.I`.
    top_miner_uids: List[int] = (-subnet.I).argsort()[:3].tolist()
    # Tools exactly as handed to bind_tools().
    original_tools: List[Any] = []
    # The same tools converted to protocol `Tool` objects.
    tools: List[Any] = []

    def bind_tools(
        self,
        tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],
        *,
        tool_choice: Optional[
            Union[Dict[str, str], Literal["any", "auto"], str]
        ] = None,
        **kwargs: Any,
    ) -> Runnable[LanguageModelInput, BaseMessage]:
        """Remember `tools` on this instance and return the model bound to them.

        Every tool must expose `.name`, `.description` and `.args` (as LangChain
        `BaseTool` objects do). Note that the instance itself is mutated:
        `self.tools` / `self.original_tools` are replaced.

        Args:
            tools: Tools to offer to the miner.
            tool_choice: None, a dict passed through unchanged, "any"/"auto",
                or the name of one specific tool.
            **kwargs: Forwarded to `self.bind`.

        Raises:
            ValueError: If `tool_choice` is neither a dict, a str nor None.
        """

        def build_arg(t):
            """Convert a tool's argument schema into the protocol's argument spec."""
            spec = {}
            for arg_name, schema in t.args.items():
                spec[arg_name] = {
                    'required': True,
                    # Schema entries without a plain 'type' no longer raise KeyError.
                    'type': schema.get('type', 'any'),
                    # Prefer the author's description; the title is only the
                    # auto-generated argument label.
                    'description': schema.get('description') or schema.get('title') or arg_name,
                }
            return spec

        self.tools = [
            Tool(name=t.name, description=t.description, arguments=build_arg(t))
            for t in tools
        ]
        self.original_tools = list(tools)

        if not tool_choice:
            pass
        elif isinstance(tool_choice, dict):
            kwargs["tool_choice"] = tool_choice
        elif isinstance(tool_choice, str) and tool_choice in ("any", "auto"):
            kwargs["tool_choice"] = {"type": tool_choice}
        elif isinstance(tool_choice, str):
            kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}
        else:
            raise ValueError(
                f"Unrecognized 'tool_choice' type {tool_choice=}. Expected dict, "
                f"str, or None."
            )
        return self.bind(tools=tools, **kwargs)

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Query the top miners and wrap the first usable reply in a ChatResult.

        Args:
            messages: Conversation so far; each message is forwarded with a
                protocol role derived from its LangChain message type.
            stop, run_manager, **kwargs: Accepted for interface compatibility,
                not used.

        Returns:
            A ChatResult with one generation. Tool calls requested by the miner
            appear both as content blocks and in `AIMessage.tool_calls`.

        Raises:
            RuntimeError: If no queried miner returned a usable response.
        """
        started = time.perf_counter()

        # Bound before the try-block so the diagnostics below can always read it.
        responses = []
        try:
            task = QnATask(
                prompt="",
                datas=[],
                urls=[],
                tools=self.tools,
                notes="",
                messages=[
                    ChatMessage(
                        role=_ROLE_BY_MESSAGE_TYPE.get(m.type, ChatRole.USER),
                        content=m.content,
                    )
                    for m in messages
                ],
            )
            responses = vali_dendrite.query(
                axons=[subnet.axons[uid] for uid in self.top_miner_uids],
                synapse=task,
                deserialize=False,
                timeout=MINER_QUERY_TIMEOUT_S,
            )
        except Exception as e:
            print("BIGGER ERROR: ", e)

        # Take the first miner whose reply parses; a failed parse never leaves a
        # half-processed value behind.
        resp = None
        for test_resp in responses:
            try:
                resp = _parse_miner_payload(test_resp.response, expect_json=bool(self.tools))
            except Exception as e:
                resp = None
                print("SMALLER ERROR: ", e)
                print(test_resp)
            if resp:
                break

        if not resp:
            print("OMG BIG ERROR (NO RESP): ", responses)
            for respo in responses:
                print(respo.dendrite.status_code)
                print(respo.axon.status_code)
            raise RuntimeError("None of the queried miners returned a usable response.")

        tool_calls = []
        if isinstance(resp, str):
            ai_message_content = resp
        else:
            ai_message_content = []
            for mesg in resp:
                if mesg['role'] == ChatRole.TOOL_CALL.value:
                    call = mesg['content']
                    if isinstance(call, str):
                        call = json.loads(call)
                    new_msg = {
                        'type': "tool_call",
                        'text': None,
                        'name': call['name'],
                        'input': call['arguments'],
                    }
                    # Also register the call in the structured field: content
                    # blocks alone are not picked up as tool calls by the agent.
                    if isinstance(call['arguments'], dict):
                        tool_calls.append(
                            {
                                "name": call['name'],
                                "args": call['arguments'],
                                "id": uuid.uuid4().hex,
                            }
                        )
                else:
                    # default for non tool calling messages
                    new_msg = {'type': 'text', 'text': mesg['content']}

                ai_message_content.append(new_msg)

        message = AIMessage(
            content=ai_message_content,
            additional_kwargs={},  # Used to add additional payload (e.g., function calling request)
            tool_calls=tool_calls,
            response_metadata={  # Use for response metadata
                # Measured wall-clock duration of this call.
                "time_in_seconds": time.perf_counter() - started,
            },
        )
        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])

    @property
    def _llm_type(self) -> str:
        """Identifier string LangChain uses for this model type."""
        return "echoing-chat-model-advanced"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Parameters that identify this model instance."""
        return {
            "model_name": "WHATEVER"
        }
# %%
from langchain_core.prompts import ChatPromptTemplate
from langchain_benchmarks.tool_usage.agents import StandardAgentFactory
from langchain_benchmarks import registry

model = CustomChatModelAdvanced()

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{instructions}"),  # Populated from task.instructions automatically
        (
            "user",
            "{question}",
        ),  # Each evaluation example is associated with a question
        ("placeholder", "{agent_scratchpad}"),  # Space for the agent to do work
    ]
)

# %%
import datetime
import uuid

from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter

# %% [markdown]
# ## Run Benchmark Tests

# %%
benchmark_local = False
benchmark_langsmith = True

# %% [markdown]
# ### Benchmark Local

# %%
import os
from langchain_benchmarks.utils import run_without_langsmith

# (display name, model) pairs to benchmark; also read by the LangSmith cell.
tests = [
    ("Mytest", CustomChatModelAdvanced())
]

# Results of the local runs, keyed by (task name, model name), so that they
# stay available for inspection after the loop.
local_test_runs = {}

if benchmark_local:
    # Defined here so this cell runs on a fresh kernel; previously it relied on
    # a `rate_limiter` that only the LangSmith cell further down creates.
    rate_limiter = RateLimiter(requests_per_second=1)

    for task in registry.tasks:
        if task.type != "ToolUsageTask":
            continue

        # No clone_public_dataset() call on this path: run_without_langsmith
        # fetches the dataset itself, and the clone needs LangSmith access.
        for model_name, model in tests:
            print()
            print(f"Benchmarking {task.name} with model: {model_name}")
            if task.name in ["Tool Usage - Relational Data", "Multiverse Math"]:
                # These two tasks get the custom model as their eval LLM.
                eval_config = task.get_eval_config(eval_llm=CustomChatModelAdvanced())
            else:
                eval_config = task.get_eval_config()

            agent_factory = StandardAgentFactory(
                task, model, prompt, rate_limiter=rate_limiter
            )

            test_run = run_without_langsmith(
                # This will clone the dataset locally if not already there
                path_or_token_id=task.dataset_id,
                llm_or_chain_factory=agent_factory,
                evaluation=eval_config,
                concurrency_level=1,
                verbose=True,
            )
            local_test_runs[(task.name, model_name)] = test_run

# %% [markdown]
# ### Benchmark with LangSmith
# %%
if benchmark_langsmith:

    from langsmith.client import Client

    # One id shared by every project created during this run.
    experiment_id = uuid.uuid4().hex
    client = Client()  # Launch langsmith client for cloning datasets
    today = datetime.date.today().isoformat()

    # You can use an optional rate limiter to rate limit your requests!
    rate_limiter = RateLimiter(requests_per_second=1)

    # Only the tool-usage tasks of the registry are benchmarked.
    for task in (t for t in registry.tasks if t.type == "ToolUsageTask"):
        # Clone the public dataset under a date-stamped name.
        dataset_name = task.name + f" ({today})"
        clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

        for model_name, model in tests:
            print()
            print(f"Benchmarking {task.name} with model: {model_name}")

            # These two tasks get the custom model as their eval LLM; all other
            # tasks use the default evaluation config.
            eval_config = (
                task.get_eval_config(eval_llm=CustomChatModelAdvanced())
                if task.name in ["Tool Usage - Relational Data", "Multiverse Math"]
                else task.get_eval_config()
            )

            agent_factory = StandardAgentFactory(
                task, model, prompt, rate_limiter=rate_limiter
            )

            client.run_on_dataset(
                dataset_name=dataset_name,
                llm_or_chain_factory=agent_factory,
                evaluation=eval_config,
                verbose=False,
                project_name=f"{model_name}-{task.name}-{today}-{experiment_id}",
                concurrency_level=1,
                project_metadata={
                    "model": model_name,
                    "id": experiment_id,
                    "task": task.name,
                    "date": today,
                    "langchain_benchmarks_version": __version__,
                },
            )