## Setup

In [15]:
from langchain_benchmarks import registry
from langchain_core.language_models.llms import LLM
import requests
from strenum import StrEnum
from pydantic import BaseModel, Field
from rich import print as rprint
from typing import Optional,List, Dict, Any
import bittensor as bt
import os

In [2]:
# Snapshot of subnet 20 on the finney network; everything below reads stake,
# permits, axons and incentive from this object.
subnet = bt.metagraph(netuid=20, network="finney")

# Wallet and validator setup
WALLET_NAME = "" # TODO, put your coldkey
HOTKEY_NAME = "" # TODO, put your hotkey
LANGCHAIN_API_KEY = "" # TODO put your LangChain API Key here if you wish to dig through lang smith results

# Only export the key when one was actually filled in above, so a key that is
# already present in the environment is not overwritten with an empty string.
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

vali_wallet = bt.wallet(name=WALLET_NAME, hotkey=HOTKEY_NAME)
vali_dendrite = bt.dendrite(wallet=vali_wallet)

# Stake cutoff used to split UIDs into validators and miners.
VALIDATOR_STAKE_THRESHOLD = 20000

# Validators: stake above the cutoff AND holding a validator permit.
validator_uids = subnet.uids[((subnet.S > VALIDATOR_STAKE_THRESHOLD) & subnet.validator_permit)]
# Miners: everything at or below the cutoff.
miner_uids = subnet.uids[(subnet.S <= VALIDATOR_STAKE_THRESHOLD)]

class Tool(BaseModel):
    """Description of a callable tool that is sent along with a query.

    `arguments` maps each argument name to a dict of properties describing it.
    """

    name: str
    description: str
    arguments: Dict[str, Dict[str, Any]]

    def toJSON(self):
        """Return the three fields as a plain dict."""
        return dict(
            name=self.name,
            description=self.description,
            arguments=self.arguments,
        )

    def to_dict(self):
        """Return the model's own dict export."""
        exported = self.dict()
        return exported

class ChatRole(StrEnum):
    """Role labels attached to each ChatMessage."""
    ASSISTANT = "assistant"
    USER = "user"
    # Messages with this role are read elsewhere in this file as carrying a
    # content dict with 'name' and 'arguments' keys.
    TOOL_CALL = "tool call"
    # NOTE(review): presumably the result returned by a tool; not used in the visible code.
    TOOL_RESPONSE = "tool response"
    
class ChatMessage(BaseModel):
    """One message in a conversation: the role of its author and its content."""

    role: ChatRole = Field(
        title="One of the ChatRole's to identify who the message is coming from.",
    )
    content: str | dict | list = Field(
        title="Contents of the chat message.",
    )

    @classmethod
    def from_dict(cls, data: Dict[str, str]):
        """Build a ChatMessage from a mapping with 'role' and 'content' keys."""
        parsed_role = ChatRole(data['role'])
        return cls(role=parsed_role, content=data['content'])

    def to_dict(self) -> Dict[str, str]:
        """Return the message as a dict with 'role' and 'content' keys."""
        return dict(role=self.role, content=self.content)

    def toJSON(self):
        """Same payload as to_dict()."""
        return self.to_dict()

class Conversation(BaseModel):
    """An ordered collection of ChatMessage objects."""

    messages: List[ChatMessage] = []

    @classmethod
    def from_list(cls, data_list: List[Dict[str, str]]):
        """Build a Conversation from a list of role/content dictionaries."""
        return cls(messages=list(map(ChatMessage.from_dict, data_list)))

    def to_list(self):
        """Return every message converted with ChatMessage.to_dict(), in order."""
        serialized = []
        for message in self.messages:
            serialized.append(message.to_dict())
        return serialized

    def toJSON(self):
        """Same payload as to_list()."""
        return self.to_list()
        
# the request protocol
class QnATask(bt.Synapse):
    """Request/response payload exchanged with miners.

    The caller fills in the query fields (prompt, messages, tools, datas);
    the answer comes back in `response`.
    """

    urls: List[str] = []   # not used right now - when enabled would allow users to pass in URLs for content
    datas: List[dict] = [] # used to pass in relevant context, could be a company knowledge base or a set of wikipedia pages
    tools: List[Tool] = [] # used to pass in tools to be leveraged in answering user query
    notes: str = "No Notes"
    prompt: str = ""       # the query / prompt
    messages: List[ChatMessage] = []
    response: Optional[dict] = {}
    timeout: Optional[float] = 3.0
    miner_uids: Optional[List[int]] = [] # put our TOP miner into the network as the miner to query (if empty list, a random list of miners will be selected)

    def toJSON(self):
        """Return the task, plus dendrite/axon status information, as a plain dict.

        Returns:
            dict with the query fields, the response, and the dendrite process
            time and dendrite/axon status codes.
        """
        return {
            "prompt": self.prompt,
            "urls": self.urls,
            "datas": self.datas,
            "tools": [t.toJSON() for t in self.tools],
            "notes": self.notes,
            # `messages` is a plain list, so each message is serialized
            # individually (a list has no toJSON() of its own).
            "messages": [m.toJSON() for m in self.messages],
            "response": self.response,
            "miner_uids": self.miner_uids,
            "dendrite_process_time": self.dendrite.process_time,
            "dendrite_status_code": self.dendrite.status_code,
            "axon_status_code": self.axon.status_code,
        }


In [3]:
import json
import time
from langchain_core.outputs.chat_generation import ChatGeneration 
from langchain.agents.output_parsers.tools import ToolAgentAction
from langchain_core.messages.ai import AIMessageChunk
from langchain_core.messages import AIMessage
from langchain.schema.output import LLMResult

from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Union, Type, Callable, Literal
from langchain_core.runnables import Runnable
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr, root_validator
from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import run_in_executor
from langchain_core.tools import BaseTool
from langchain_core.language_models import LanguageModelInput

class CustomChatModelAdvanced(BaseChatModel):
    """LangChain chat model that answers by querying miners on the subnet.

    Each generation wraps the conversation in a QnATask, sends it to the
    miners listed in `top_miner_uids` through `vali_dendrite`, and converts the
    selected miner response into an AIMessage.
    """

    # UIDs with the three largest `subnet.I` values, computed once when the
    # class is defined.
    top_miner_uids = (-subnet.I).argsort()[:3].tolist()
    # Tools exactly as passed to bind_tools().
    original_tools = []
    # The same tools converted to the `Tool` model that is sent to miners.
    tools = []

    def bind_tools(
        self,
        tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],
        *,
        tool_choice: Optional[
            Union[Dict[str, str], Literal["any", "auto"], str]
        ] = None,
        **kwargs: Any,
    ) -> Runnable[LanguageModelInput, BaseMessage]:
        """Record the tools on this model and return a runnable bound to them.

        Args:
            tools: Tools to expose. Each one is read through `.name`,
                `.description` and `.args`, so in practice they must be
                BaseTool-like objects.
            tool_choice: None, a dict, "any"/"auto", or a tool name.
            **kwargs: Forwarded to `bind`.

        Returns:
            The result of `self.bind(tools=tools, **kwargs)`.

        Raises:
            ValueError: If `tool_choice` is not None, a dict or a str.
        """

        def build_arg(t):
            # Every argument is marked required; the schema 'title' is used as
            # the description sent to miners.
            to_ret = {}
            for k, v in t.args.items():
                to_ret[k] = {'required': True, 'type': v['type'], 'description': v['title']}

            return to_ret

        self.tools = [Tool(name=t.name, description=t.description, arguments=build_arg(t)) for t in tools]

        self.original_tools = tools
        formatted_tools = tools
        if not tool_choice:
            pass
        elif isinstance(tool_choice, dict):
            kwargs["tool_choice"] = tool_choice
        elif isinstance(tool_choice, str) and tool_choice in ("any", "auto"):
            kwargs["tool_choice"] = {"type": tool_choice}
        elif isinstance(tool_choice, str):
            kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}
        else:
            raise ValueError(
                f"Unrecognized 'tool_choice' type {tool_choice=}. Expected dict, "
                f"str, or None."
            )
        return self.bind(tools=formatted_tools, **kwargs)

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Query the top miners and wrap the selected response in a ChatResult.

        Args:
            messages: Conversation so far; every message is forwarded with the
                USER role.
            stop: Unused.
            run_manager: Unused.
            **kwargs: Unused.

        Returns:
            A ChatResult holding a single AIMessage. When no miner produced a
            usable response the message content is an empty string and
            diagnostics are printed.
        """

        def get_role(name):
            # Every LangChain message type is currently forwarded as USER.
            if name == "SystemMessage":
                return ChatRole.USER

            if name == "HumanMessage":
                return ChatRole.USER

            return ChatRole.USER

        def has_tool_call(payload):
            # True when any message in the decoded payload is a tool call.
            return any(
                isinstance(msg, dict) and msg.get('role') == ChatRole.TOOL_CALL
                for msg in payload
            )

        started = time.monotonic()
        resp = None
        # Bound up front so the diagnostics below still work when building the
        # task or querying the network raises.
        responses = []
        try:
            task = QnATask(
                prompt="",
                datas=[],
                urls=[],
                tools=self.tools,
                notes="",
                messages=[ChatMessage(role=get_role(type(m).__name__), content=m.content) for m in messages],
            )

            responses = vali_dendrite.query(
                axons=[subnet.axons[uid] for uid in self.top_miner_uids],
                synapse=task,
                deserialize=False,
                timeout=60,
            )
            for test_resp in responses:
                try:
                    if "response" not in test_resp.response.keys():
                        continue

                    if self.tools:
                        candidate = json.loads(test_resp.response["response"])
                        found_tool_call = has_tool_call(candidate)
                        # Keep the latest decodable payload as a fallback, but
                        # stop at the first one that contains a tool call.
                        resp = candidate
                        if found_tool_call:
                            break
                    else:
                        resp = test_resp.response["response"]
                        if resp:
                            break

                except Exception as e:
                    print("SMALLER ERROR: ", e)
                    print(test_resp)

        except Exception as e:
            print("BIGGER ERROR: ", e)

        if not resp:
            print("OMG BIG ERROR (NO RESP): ", responses)
            for respo in responses:
                print(respo.dendrite.status_code)
                print(respo.axon.status_code)

        ai_message_content = []
        if resp is None:
            # No miner answered: return an empty message instead of failing
            # while iterating over None.
            ai_message_content = ""
        elif isinstance(resp, str):
            ai_message_content = resp
        else:
            # NOTE(review): tool calls are emitted as content blocks of type
            # "tool_call"; confirm the downstream agent parser reads these
            # rather than AIMessage.tool_calls.
            for mesg in resp:
                new_msg = {'type': 'text', 'text': mesg['content']} # default for non tool calling messages
                if mesg['role'] == ChatRole.TOOL_CALL:
                    new_msg['type'] = "tool_call"
                    new_msg['text'] = None
                    new_msg['name'] = mesg['content']['name']
                    new_msg['input'] = mesg['content']['arguments']

                ai_message_content.append(new_msg)

        message = AIMessage(
            content=ai_message_content,
            additional_kwargs={},  # Used to add additional payload (e.g., function calling request)
            response_metadata={  # Use for response metadata
                # Wall-clock time spent in this call (previously a fixed 3).
                "time_in_seconds": time.monotonic() - started,
            },
        )
        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])

    @property
    def _llm_type(self) -> str:
        """Identifier string reported for this model type."""
        return "echoing-chat-model-advanced"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Parameters reported as identifying this model."""
        return {
            "model_name": "WHATEVER"
        }

In [4]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_benchmarks.tool_usage.agents import StandardAgentFactory
from langchain_benchmarks import registry

model = CustomChatModelAdvanced()

# Message templates for the agent, in the order they are sent.
prompt_messages = [
    # Populated from task.instructions automatically
    ("system", "{instructions}"),
    # Each evaluation example is associated with a question
    ("user", "{question}"),
    # Space for the agent to do work
    ("placeholder", "{agent_scratchpad}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)


In [5]:
import datetime
import uuid

from langsmith.client import Client

from langchain_benchmarks import (
    __version__,
    clone_public_dataset,
    model_registry,
    registry,
)
from langchain_benchmarks.rate_limiting import RateLimiter

## Run Benchmark Tests

In [6]:
# Switches selecting which of the benchmark cells below do any work.
# When True, the local cell runs the tasks through run_without_langsmith.
benchmark_local = False
# When True, the LangSmith cell runs the tasks through client.run_on_dataset.
benchmark_langsmith = True

### Benchmark Locally

In [7]:
import os
from langchain_benchmarks.utils import run_without_langsmith

# (label, model) pairs to benchmark; the LangSmith cell below reuses this list.
tests = [
    ("Mytest", CustomChatModelAdvanced())
]

if benchmark_local:
    # Defined here so this cell also runs on a fresh kernel; previously these
    # names only existed after the LangSmith cell had been executed.
    today = datetime.date.today().isoformat()
    rate_limiter = RateLimiter(requests_per_second=1)

    for task in registry.tasks:
        # Only the tool-usage tasks are benchmarked.
        if task.type != "ToolUsageTask":
            continue

        dataset_name = task.name + f" ({today})"
        clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

        for model_name, model in tests:
            print()
            print(f"Benchmarking {task.name} with model: {model_name}")
            # These two tasks are given the custom model as their evaluator
            # LLM; the others use the task's default eval config.
            if task.name in ["Tool Usage - Relational Data","Multiverse Math"]:
                eval_config = task.get_eval_config(eval_llm=CustomChatModelAdvanced())
            else:
                eval_config = task.get_eval_config()

            agent_factory = StandardAgentFactory(
                task, model, prompt, rate_limiter=rate_limiter
            )

            test_run = run_without_langsmith(
                # This will clone the dataset locally if not already there
                path_or_token_id=task.dataset_id,
                llm_or_chain_factory=agent_factory,
                evaluation=eval_config,
                concurrency_level=1,
                verbose=True,
            )

### Benchmark with LangSmith

In [13]:
if benchmark_langsmith:

    from langsmith.client import Client

    # One id shared by every project created during this run.
    experiment_id = uuid.uuid4().hex
    client = Client()  # Launch langsmith client for cloning datasets
    today = datetime.date.today().isoformat()

    # You can use an optional rate limiter to rate limit your requests!
    rate_limiter = RateLimiter(requests_per_second=1)

    for task in registry.tasks:
        # Only the tool-usage tasks are benchmarked.
        if task.type != "ToolUsageTask":
            continue

        dataset_name = task.name + f" ({today})"
        clone_public_dataset(task.dataset_id, dataset_name=dataset_name)

        for model_name, model in tests:
            print()
            print(f"Benchmarking {task.name} with model: {model_name}")

            # These two tasks are given the custom model as their evaluator
            # LLM; every other task uses its default eval config.
            eval_llm_kwargs = (
                {"eval_llm": CustomChatModelAdvanced()}
                if task.name in ("Tool Usage - Relational Data", "Multiverse Math")
                else {}
            )
            eval_config = task.get_eval_config(**eval_llm_kwargs)

            agent_factory = StandardAgentFactory(
                task, model, prompt, rate_limiter=rate_limiter
            )

            run_metadata = {
                "model": model_name,
                "id": experiment_id,
                "task": task.name,
                "date": today,
                "langchain_benchmarks_version": __version__,
            }
            client.run_on_dataset(
                dataset_name=dataset_name,
                llm_or_chain_factory=agent_factory,
                evaluation=eval_config,
                verbose=False,
                project_name=f"{model_name}-{task.name}-{today}-{experiment_id}",
                concurrency_level=1,
                project_metadata=run_metadata,
            )

Dataset Tool Usage - Typewriter (1 tool) (2024-07-08) already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/9796c4da-021f-5ea2-ad5c-978e23367525/datasets/5bbaa6ff-969d-49fc-a319-060ec17edb3b.

Benchmarking Tool Usage - Typewriter (1 tool) with model: Mytest
View the evaluation results for project 'Mytest-Tool Usage - Typewriter (1 tool)-2024-07-08-e8c1ff482d3d4bffa6d7a11771058bfb' at:
https://smith.langchain.com/o/9796c4da-021f-5ea2-ad5c-978e23367525/datasets/5bbaa6ff-969d-49fc-a319-060ec17edb3b/compare?selectedSessions=ed660500-22c5-47b1-84ee-af411768a46e

View all tests for Dataset Tool Usage - Typewriter (1 tool) (2024-07-08) at:
https://smith.langchain.com/o/9796c4da-021f-5ea2-ad5c-978e23367525/datasets/5bbaa6ff-969d-49fc-a319-060ec17edb3b
[------------------------------------------------->] 20/20Dataset Tool Usage - Typewriter (26 tools) (2024-07-08) already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/9796c4da-021f-

  0%|          | 0/20 [00:00<?, ?it/s]

Finished fetching examples. Creating dataset...
New dataset created you can access it at https://smith.langchain.com/o/9796c4da-021f-5ea2-ad5c-978e23367525/datasets/05875ab0-3b01-4bea-ae6f-89d3a4f79007.
Done creating dataset.

Benchmarking Multiverse Math with model: Mytest
View the evaluation results for project 'Mytest-Multiverse Math-2024-07-08-e8c1ff482d3d4bffa6d7a11771058bfb' at:
https://smith.langchain.com/o/9796c4da-021f-5ea2-ad5c-978e23367525/datasets/05875ab0-3b01-4bea-ae6f-89d3a4f79007/compare?selectedSessions=52e34faf-ef12-47a4-9907-8aae6518eb7a

View all tests for Dataset Multiverse Math (2024-07-08) at:
https://smith.langchain.com/o/9796c4da-021f-5ea2-ad5c-978e23367525/datasets/05875ab0-3b01-4bea-ae6f-89d3a4f79007
[------------------------------------------------->] 20/20