Spaces:

gorilla-llm
/

berkeley-function-calling-leaderboard

Running

App Files Files Community

Huanzhi Mao committed on Apr 1, 2024

Commit

c94dd2f

1 Parent(s): 23ba85c

update description

Browse files

Files changed (1) hide show

app.py +166 -79

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import os
 import re
 import pandas as pd
 import csv
 # from anthropic import Anthropic
 from openai import OpenAI
 from mistralai.client import MistralClient
@@ -632,12 +633,26 @@ COLUMNS = [
     "Latency Standard Deviation (s)",
 ]
 def parse_csv(text):
-    lines = text.split('\n')
     lines = lines[1:]
     result = []
     for i in range(len(lines)):
-        row = lines[i].split(',')
         row = [parse_value(value) for value in row]
         row.pop(3)
         row.pop(5)
@@ -647,12 +662,13 @@ def parse_csv(text):
         row.pop(6)
         row.pop(10)
         row.pop(10)
         result.append(row)
     return result
 def parse_value(value):
-    if value.endswith('%'):
         return float(value[:-1])
     try:
         return float(value)
@@ -660,54 +676,57 @@ def parse_value(value):
         return value
-with open('./data.csv', 'r') as file:
     csv_text = file.read()
     DATA = parse_csv(csv_text)
 MODELS = [
     "gorilla-openfunctions-v2",
     "gpt-4-1106-preview-fc",
     "gpt-4-0125-preview-fc",
     "gpt-3.5-turbo-0125-fc",
-    "mistral-large-fc"
 ]
 def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     # Login and get access token
-    login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
-    headers = {'Content-Type': 'application/json'}
-    login_data = {
-        'username': 'website',
-        'password': mongoDBPassword
-    }
     response = requests.post(login_url, headers=headers, json=login_data)
-    access_token = response.json()['access_token']
     # Prepare data for sending feedback
-    url = 'https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/insertOne'
     headers = {
-        'Content-Type': 'application/json',
-        'Access-Control-Request-Headers': '*',
-        'Authorization': f'Bearer {access_token}'
     }
     if not prompt or not function:
         return
     body = {
-        'collection': "vote",
-        'database': "gorilla-feedback",
-        'dataSource': "gorilla",
-        'document': {
-            'prompt': prompt,
-            'funcDef': function,
-            'temperature': temperature,
-            'model': model,
-            'codeOutput': codeOutput,
-            'jsonOutput': jsonOutput,
-            'result': vote
-        }
     }
     # Send feedback
@@ -715,60 +734,79 @@ def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput,
     if response.ok:
         print("Document inserted:", response.json())
     else:
-        print('Error:', response.text)
 def get_voting_result():
-    login_url = 'https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login'
-    headers = {'Content-Type': 'application/json'}
-    login_data = {
-        'username': 'website',
-        'password': mongoDBPassword
-    }
     response = requests.post(login_url, headers=headers, json=login_data)
-    access_token = response.json()['access_token']
     # Scanning the database
-    url = 'https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find'
     headers = {
-        'Content-Type': 'application/json',
-        'Access-Control-Request-Headers': '*',
-        'Authorization': f'Bearer {access_token}'
     }
     body = {
-        'collection': "vote",
-        'database': "gorilla-feedback",
-        'dataSource': "gorilla",
     }
     response = requests.post(url, headers=headers, json=body)
     if response.ok:
         data = response.json()
-        votes = data['documents']
-        votes = [vote for vote in votes if vote['result'] in ['positive', 'negative']]
         # extract only the model, positive count, negative count
         model_votes = {}
         for vote in votes:
-            model = vote['model']
             if model not in model_votes:
-                model_votes[model] = {'positive': 0, 'negative': 0}
-            model_votes[model][vote['result']] += 1
         for model in model_votes:
-            model_votes[model]['accuracy'] = model_votes[model]['positive'] / (model_votes[model]['positive'] + model_votes[model]['negative'])
         result = []
         for model in model_votes:
-            result.append([model, model_votes[model]['accuracy'], model_votes[model]['positive'], model_votes[model]['negative']])
         result = sorted(result, key=lambda x: x[1], reverse=True)
-        return pd.DataFrame(result, columns=['Model', 'Accuracy', 'Positive', 'Negative'])
     else:
-        print('Error:', response.text)
         return []
-def send_feedback_negative(prompt, function, model, temperature, codeOutput, jsonOutput):
-    send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, "negative")
     return "Thank you for your feedback. We will use this to improve our service."
-def send_feedback_positive(prompt, function, model, temperature, codeOutput, jsonOutput):
-    send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, "positive")
     return "Thank you for your feedback. We will use this to improve our service."
@@ -905,7 +943,7 @@ def get_openai_response(prompt, function, model, temperature):
 def get_mistral_response(prompt, function, model, temperature):
-    client = MistralClient(api_key= mistralKey)
     oai_tool = []
     function = json.loads(function)
     item = function  # use item in the later code
@@ -913,7 +951,9 @@ def get_mistral_response(prompt, function, model, temperature):
         item["name"] = re.sub(
             r"\.", "_", item["name"]
         )  # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
-    item["parameters"]["type"] = "object"  # If typing is missing, we assume it is an object since OAI requires a type.
     if "properties" not in item["parameters"]:
         item["parameters"]["properties"] = item["parameters"].copy()
         item["parameters"]["type"] = "object"
@@ -928,12 +968,12 @@ def get_mistral_response(prompt, function, model, temperature):
     )
     oai_tool.append({"type": "function", "function": item})
     message = [
-            ChatMessage(role="user", content=prompt),
-        ]
     chat_response = client.chat(
         model="mistral-large-latest",
         messages=message,
-        tools = oai_tool,
         temperature=temperature,
     )
     try:
@@ -949,8 +989,8 @@ def get_mistral_response(prompt, function, model, temperature):
     except:
         result = chat_response.choices[0].message.content
         return result, "The model failed to return a JSON output."
 def distribute_task(prompt, function, model, temperature):
     if "gpt" in model:
         return get_openai_response(prompt, function, model, temperature)
@@ -968,6 +1008,13 @@ def get_leaderboard():
     return leaderboard_df
 prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
 funcDescription = gr.Textbox(
     label="Function Description", placeholder="Describe the function...", lines=20
@@ -977,14 +1024,40 @@ model = gr.Dropdown(label="Model", choices=MODELS)
 with gr.Blocks() as demo:
     with gr.Tabs():
-        with gr.TabItem("Leaderboard"):
-            gr.Markdown("**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html) and [code](https://github.com/ShishirPatil/gorilla).**")
             gr.Markdown(
-                "**Note: AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**"
             )
-            leaderboard_data = gr.Dataframe(
-                value=get_leaderboard(), wrap=True
             )
         with gr.TabItem("Try It Out"):
             with gr.Row():
@@ -1050,18 +1123,32 @@ with gr.Blocks() as demo:
                 fn=None,
                 inputs=[prompt, model, temperature, codeOutput, jsonOutput],
                 outputs=[],
-                js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=**Issue Description**%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0A**Additional Information**\n`, "_blank")'
             )
             thumbs_up.click(
                 fn=send_feedback_positive,
-                inputs=[prompt, funcDescription, model, temperature, codeOutput, jsonOutput],
                 outputs=[feedbackMsg],
             )
             thumbs_down.click(
                 fn=send_feedback_negative,
-                inputs=[prompt, funcDescription, model, temperature, codeOutput, jsonOutput],
                 outputs=[feedbackMsg],
             )
@@ -1070,5 +1157,5 @@ with gr.Blocks() as demo:
         #     leaderboard_data = gr.Dataframe(
         #         value=get_voting_result(), wrap=True
         #     )
 demo.launch()

 import re
 import pandas as pd
 import csv
 # from anthropic import Anthropic
 from openai import OpenAI
 from mistralai.client import MistralClient
     "Latency Standard Deviation (s)",
 ]
+COLUMNS_SUMMARY = [
+    "Rank",
+    "Overall Acc",
+    "Model",
+    "Organization",
+    "License",
+    "AST Summary",
+    "Exec Summary",
+    "Relevance Detection",
+    "Cost ($ Per 1k Function Calls)",
+    "Latency Mean (s)",
+]
 def parse_csv(text):
+    lines = text.split("\n")
     lines = lines[1:]
     result = []
     for i in range(len(lines)):
+        row = lines[i].split(",")
         row = [parse_value(value) for value in row]
         row.pop(3)
         row.pop(5)
         row.pop(6)
         row.pop(10)
         row.pop(10)
         result.append(row)
     return result
 def parse_value(value):
+    if value.endswith("%"):
         return float(value[:-1])
     try:
         return float(value)
         return value
+with open("./data.csv", "r") as file:
     csv_text = file.read()
     DATA = parse_csv(csv_text)
+    DATA_SUMMARY = [
+        row[:5]
+        + [round((row[5] + row[6] + row[7] + row[8]) / 4, 2)]
+        + [round((row[9] + row[10] + row[11] + row[12]) / 4, 2)]
+        + row[13:16]
+        for row in DATA
+    ]
 MODELS = [
     "gorilla-openfunctions-v2",
     "gpt-4-1106-preview-fc",
     "gpt-4-0125-preview-fc",
     "gpt-3.5-turbo-0125-fc",
+    "mistral-large-fc",
 ]
 def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
     # Login and get access token
+    login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
+    headers = {"Content-Type": "application/json"}
+    login_data = {"username": "website", "password": mongoDBPassword}
     response = requests.post(login_url, headers=headers, json=login_data)
+    access_token = response.json()["access_token"]
     # Prepare data for sending feedback
+    url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/insertOne"
     headers = {
+        "Content-Type": "application/json",
+        "Access-Control-Request-Headers": "*",
+        "Authorization": f"Bearer {access_token}",
     }
     if not prompt or not function:
         return
     body = {
+        "collection": "vote",
+        "database": "gorilla-feedback",
+        "dataSource": "gorilla",
+        "document": {
+            "prompt": prompt,
+            "funcDef": function,
+            "temperature": temperature,
+            "model": model,
+            "codeOutput": codeOutput,
+            "jsonOutput": jsonOutput,
+            "result": vote,
+        },
     }
     # Send feedback
     if response.ok:
         print("Document inserted:", response.json())
     else:
+        print("Error:", response.text)
 def get_voting_result():
+    login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
+    headers = {"Content-Type": "application/json"}
+    login_data = {"username": "website", "password": mongoDBPassword}
     response = requests.post(login_url, headers=headers, json=login_data)
+    access_token = response.json()["access_token"]
     # Scanning the database
+    url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find"
     headers = {
+        "Content-Type": "application/json",
+        "Access-Control-Request-Headers": "*",
+        "Authorization": f"Bearer {access_token}",
     }
     body = {
+        "collection": "vote",
+        "database": "gorilla-feedback",
+        "dataSource": "gorilla",
     }
     response = requests.post(url, headers=headers, json=body)
     if response.ok:
         data = response.json()
+        votes = data["documents"]
+        votes = [vote for vote in votes if vote["result"] in ["positive", "negative"]]
         # extract only the model, positive count, negative count
         model_votes = {}
         for vote in votes:
+            model = vote["model"]
             if model not in model_votes:
+                model_votes[model] = {"positive": 0, "negative": 0}
+            model_votes[model][vote["result"]] += 1
         for model in model_votes:
+            model_votes[model]["accuracy"] = model_votes[model]["positive"] / (
+                model_votes[model]["positive"] + model_votes[model]["negative"]
+            )
         result = []
         for model in model_votes:
+            result.append(
+                [
+                    model,
+                    model_votes[model]["accuracy"],
+                    model_votes[model]["positive"],
+                    model_votes[model]["negative"],
+                ]
+            )
         result = sorted(result, key=lambda x: x[1], reverse=True)
+        return pd.DataFrame(
+            result, columns=["Model", "Accuracy", "Positive", "Negative"]
+        )
     else:
+        print("Error:", response.text)
         return []
+def send_feedback_negative(
+    prompt, function, model, temperature, codeOutput, jsonOutput
+):
+    send_feedback(
+        prompt, function, model, temperature, codeOutput, jsonOutput, "negative"
+    )
     return "Thank you for your feedback. We will use this to improve our service."
+def send_feedback_positive(
+    prompt, function, model, temperature, codeOutput, jsonOutput
+):
+    send_feedback(
+        prompt, function, model, temperature, codeOutput, jsonOutput, "positive"
+    )
     return "Thank you for your feedback. We will use this to improve our service."
 def get_mistral_response(prompt, function, model, temperature):
+    client = MistralClient(api_key=mistralKey)
     oai_tool = []
     function = json.loads(function)
     item = function  # use item in the later code
         item["name"] = re.sub(
             r"\.", "_", item["name"]
         )  # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
+    item["parameters"][
+        "type"
+    ] = "object"  # If typing is missing, we assume it is an object since OAI requires a type.
     if "properties" not in item["parameters"]:
         item["parameters"]["properties"] = item["parameters"].copy()
         item["parameters"]["type"] = "object"
     )
     oai_tool.append({"type": "function", "function": item})
     message = [
+        ChatMessage(role="user", content=prompt),
+    ]
     chat_response = client.chat(
         model="mistral-large-latest",
         messages=message,
+        tools=oai_tool,
         temperature=temperature,
     )
     try:
     except:
         result = chat_response.choices[0].message.content
         return result, "The model failed to return a JSON output."
 def distribute_task(prompt, function, model, temperature):
     if "gpt" in model:
         return get_openai_response(prompt, function, model, temperature)
     return leaderboard_df
+def get_summary():
+    # Convert the leaderboard data to a pandas DataFrame for easier handling and display
+    leaderboard_df = pd.DataFrame(DATA_SUMMARY, columns=COLUMNS_SUMMARY)
+    leaderboard_df = leaderboard_df.sort_values(by="Rank")
+    return leaderboard_df
 prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
 funcDescription = gr.Textbox(
     label="Function Description", placeholder="Describe the function...", lines=20
 with gr.Blocks() as demo:
     with gr.Tabs():
+        with gr.TabItem("Summary Leaderboard"):
+            gr.Markdown(
+                "**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/10_checker_manual.html) and [code](https://github.com/ShishirPatil/gorilla).**"
+            )
+            gr.Markdown(
+                """**AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**
+                **FC = native support for function/tool calling.**
+                **Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.**
+                **AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.**
+                **Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).**
+                """
+            )
+            leaderboard_data = gr.Dataframe(value=get_summary(), wrap=True)
+        with gr.TabItem("Full Leaderboard"):
             gr.Markdown(
+                "**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/10_checker_manual.html) and [code](https://github.com/ShishirPatil/gorilla).**"
             )
+            gr.Markdown(
+                """**AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**
+                **FC = native support for function/tool calling.**
+                **Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.**
+                **AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.**
+                **Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).**
+                """
             )
+            leaderboard_data = gr.Dataframe(value=get_leaderboard(), wrap=True)
         with gr.TabItem("Try It Out"):
             with gr.Row():
                 fn=None,
                 inputs=[prompt, model, temperature, codeOutput, jsonOutput],
                 outputs=[],
+                js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=**Issue Description**%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0A**Additional Information**\n`, "_blank")',
             )
             thumbs_up.click(
                 fn=send_feedback_positive,
+                inputs=[
+                    prompt,
+                    funcDescription,
+                    model,
+                    temperature,
+                    codeOutput,
+                    jsonOutput,
+                ],
                 outputs=[feedbackMsg],
             )
             thumbs_down.click(
                 fn=send_feedback_negative,
+                inputs=[
+                    prompt,
+                    funcDescription,
+                    model,
+                    temperature,
+                    codeOutput,
+                    jsonOutput,
+                ],
                 outputs=[feedbackMsg],
             )
         #     leaderboard_data = gr.Dataframe(
         #         value=get_voting_result(), wrap=True
         #     )
 demo.launch()