Huanzhi Mao
committed on
Commit
·
c94dd2f
1
Parent(s):
23ba85c
update description
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import os
|
|
| 5 |
import re
|
| 6 |
import pandas as pd
|
| 7 |
import csv
|
|
|
|
| 8 |
# from anthropic import Anthropic
|
| 9 |
from openai import OpenAI
|
| 10 |
from mistralai.client import MistralClient
|
|
@@ -632,12 +633,26 @@ COLUMNS = [
|
|
| 632 |
"Latency Standard Deviation (s)",
|
| 633 |
]
|
| 634 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 635 |
def parse_csv(text):
|
| 636 |
-
lines = text.split(
|
| 637 |
lines = lines[1:]
|
| 638 |
result = []
|
| 639 |
for i in range(len(lines)):
|
| 640 |
-
row = lines[i].split(
|
| 641 |
row = [parse_value(value) for value in row]
|
| 642 |
row.pop(3)
|
| 643 |
row.pop(5)
|
|
@@ -647,12 +662,13 @@ def parse_csv(text):
|
|
| 647 |
row.pop(6)
|
| 648 |
row.pop(10)
|
| 649 |
row.pop(10)
|
| 650 |
-
|
| 651 |
result.append(row)
|
| 652 |
return result
|
| 653 |
|
|
|
|
| 654 |
def parse_value(value):
|
| 655 |
-
if value.endswith(
|
| 656 |
return float(value[:-1])
|
| 657 |
try:
|
| 658 |
return float(value)
|
|
@@ -660,54 +676,57 @@ def parse_value(value):
|
|
| 660 |
return value
|
| 661 |
|
| 662 |
|
| 663 |
-
with open(
|
| 664 |
csv_text = file.read()
|
| 665 |
DATA = parse_csv(csv_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
|
| 667 |
MODELS = [
|
| 668 |
"gorilla-openfunctions-v2",
|
| 669 |
"gpt-4-1106-preview-fc",
|
| 670 |
"gpt-4-0125-preview-fc",
|
| 671 |
"gpt-3.5-turbo-0125-fc",
|
| 672 |
-
"mistral-large-fc"
|
| 673 |
]
|
| 674 |
|
| 675 |
|
| 676 |
def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
|
| 677 |
# Login and get access token
|
| 678 |
-
login_url =
|
| 679 |
-
headers = {
|
| 680 |
-
login_data = {
|
| 681 |
-
'username': 'website',
|
| 682 |
-
'password': mongoDBPassword
|
| 683 |
-
}
|
| 684 |
response = requests.post(login_url, headers=headers, json=login_data)
|
| 685 |
-
access_token = response.json()[
|
| 686 |
# Prepare data for sending feedback
|
| 687 |
-
url =
|
| 688 |
headers = {
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
}
|
| 693 |
|
| 694 |
-
|
| 695 |
if not prompt or not function:
|
| 696 |
return
|
| 697 |
|
| 698 |
body = {
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
}
|
| 711 |
}
|
| 712 |
|
| 713 |
# Send feedback
|
|
@@ -715,60 +734,79 @@ def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput,
|
|
| 715 |
if response.ok:
|
| 716 |
print("Document inserted:", response.json())
|
| 717 |
else:
|
| 718 |
-
print(
|
|
|
|
| 719 |
|
| 720 |
def get_voting_result():
|
| 721 |
-
login_url =
|
| 722 |
-
headers = {
|
| 723 |
-
login_data = {
|
| 724 |
-
'username': 'website',
|
| 725 |
-
'password': mongoDBPassword
|
| 726 |
-
}
|
| 727 |
response = requests.post(login_url, headers=headers, json=login_data)
|
| 728 |
-
access_token = response.json()[
|
| 729 |
-
|
| 730 |
# Scanning the database
|
| 731 |
-
url =
|
| 732 |
headers = {
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
}
|
| 737 |
body = {
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
}
|
| 742 |
response = requests.post(url, headers=headers, json=body)
|
| 743 |
if response.ok:
|
| 744 |
data = response.json()
|
| 745 |
-
votes = data[
|
| 746 |
-
votes = [vote for vote in votes if vote[
|
| 747 |
# extract only the model, positive count, negative count
|
| 748 |
model_votes = {}
|
| 749 |
for vote in votes:
|
| 750 |
-
model = vote[
|
| 751 |
if model not in model_votes:
|
| 752 |
-
model_votes[model] = {
|
| 753 |
-
model_votes[model][vote[
|
| 754 |
for model in model_votes:
|
| 755 |
-
model_votes[model][
|
| 756 |
-
|
|
|
|
|
|
|
| 757 |
result = []
|
| 758 |
for model in model_votes:
|
| 759 |
-
result.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 760 |
result = sorted(result, key=lambda x: x[1], reverse=True)
|
| 761 |
-
return pd.DataFrame(
|
|
|
|
|
|
|
| 762 |
else:
|
| 763 |
-
print(
|
| 764 |
return []
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
return "Thank you for your feedback. We will use this to improve our service."
|
| 769 |
|
| 770 |
-
|
| 771 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
return "Thank you for your feedback. We will use this to improve our service."
|
| 773 |
|
| 774 |
|
|
@@ -905,7 +943,7 @@ def get_openai_response(prompt, function, model, temperature):
|
|
| 905 |
|
| 906 |
|
| 907 |
def get_mistral_response(prompt, function, model, temperature):
|
| 908 |
-
client = MistralClient(api_key=
|
| 909 |
oai_tool = []
|
| 910 |
function = json.loads(function)
|
| 911 |
item = function # use item in the later code
|
|
@@ -913,7 +951,9 @@ def get_mistral_response(prompt, function, model, temperature):
|
|
| 913 |
item["name"] = re.sub(
|
| 914 |
r"\.", "_", item["name"]
|
| 915 |
) # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
|
| 916 |
-
item["parameters"][
|
|
|
|
|
|
|
| 917 |
if "properties" not in item["parameters"]:
|
| 918 |
item["parameters"]["properties"] = item["parameters"].copy()
|
| 919 |
item["parameters"]["type"] = "object"
|
|
@@ -928,12 +968,12 @@ def get_mistral_response(prompt, function, model, temperature):
|
|
| 928 |
)
|
| 929 |
oai_tool.append({"type": "function", "function": item})
|
| 930 |
message = [
|
| 931 |
-
|
| 932 |
-
|
| 933 |
chat_response = client.chat(
|
| 934 |
model="mistral-large-latest",
|
| 935 |
messages=message,
|
| 936 |
-
tools
|
| 937 |
temperature=temperature,
|
| 938 |
)
|
| 939 |
try:
|
|
@@ -949,8 +989,8 @@ def get_mistral_response(prompt, function, model, temperature):
|
|
| 949 |
except:
|
| 950 |
result = chat_response.choices[0].message.content
|
| 951 |
return result, "The model failed to return a JSON output."
|
| 952 |
-
|
| 953 |
-
|
| 954 |
def distribute_task(prompt, function, model, temperature):
|
| 955 |
if "gpt" in model:
|
| 956 |
return get_openai_response(prompt, function, model, temperature)
|
|
@@ -968,6 +1008,13 @@ def get_leaderboard():
|
|
| 968 |
return leaderboard_df
|
| 969 |
|
| 970 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
|
| 972 |
funcDescription = gr.Textbox(
|
| 973 |
label="Function Description", placeholder="Describe the function...", lines=20
|
|
@@ -977,14 +1024,40 @@ model = gr.Dropdown(label="Model", choices=MODELS)
|
|
| 977 |
|
| 978 |
with gr.Blocks() as demo:
|
| 979 |
with gr.Tabs():
|
| 980 |
-
with gr.TabItem("Leaderboard"):
|
| 981 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
gr.Markdown(
|
| 983 |
-
"**
|
| 984 |
)
|
| 985 |
-
|
| 986 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
)
|
|
|
|
| 988 |
|
| 989 |
with gr.TabItem("Try It Out"):
|
| 990 |
with gr.Row():
|
|
@@ -1050,18 +1123,32 @@ with gr.Blocks() as demo:
|
|
| 1050 |
fn=None,
|
| 1051 |
inputs=[prompt, model, temperature, codeOutput, jsonOutput],
|
| 1052 |
outputs=[],
|
| 1053 |
-
js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=**Issue Description**%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0A**Additional Information**\n`, "_blank")'
|
| 1054 |
)
|
| 1055 |
-
|
| 1056 |
thumbs_up.click(
|
| 1057 |
fn=send_feedback_positive,
|
| 1058 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1059 |
outputs=[feedbackMsg],
|
| 1060 |
)
|
| 1061 |
-
|
| 1062 |
thumbs_down.click(
|
| 1063 |
fn=send_feedback_negative,
|
| 1064 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1065 |
outputs=[feedbackMsg],
|
| 1066 |
)
|
| 1067 |
|
|
@@ -1070,5 +1157,5 @@ with gr.Blocks() as demo:
|
|
| 1070 |
# leaderboard_data = gr.Dataframe(
|
| 1071 |
# value=get_voting_result(), wrap=True
|
| 1072 |
# )
|
| 1073 |
-
|
| 1074 |
demo.launch()
|
|
|
|
| 5 |
import re
|
| 6 |
import pandas as pd
|
| 7 |
import csv
|
| 8 |
+
|
| 9 |
# from anthropic import Anthropic
|
| 10 |
from openai import OpenAI
|
| 11 |
from mistralai.client import MistralClient
|
|
|
|
| 633 |
"Latency Standard Deviation (s)",
|
| 634 |
]
|
| 635 |
|
| 636 |
+
COLUMNS_SUMMARY = [
|
| 637 |
+
"Rank",
|
| 638 |
+
"Overall Acc",
|
| 639 |
+
"Model",
|
| 640 |
+
"Organization",
|
| 641 |
+
"License",
|
| 642 |
+
"AST Summary",
|
| 643 |
+
"Exec Summary",
|
| 644 |
+
"Relevance Detection",
|
| 645 |
+
"Cost ($ Per 1k Function Calls)",
|
| 646 |
+
"Latency Mean (s)",
|
| 647 |
+
]
|
| 648 |
+
|
| 649 |
+
|
| 650 |
def parse_csv(text):
|
| 651 |
+
lines = text.split("\n")
|
| 652 |
lines = lines[1:]
|
| 653 |
result = []
|
| 654 |
for i in range(len(lines)):
|
| 655 |
+
row = lines[i].split(",")
|
| 656 |
row = [parse_value(value) for value in row]
|
| 657 |
row.pop(3)
|
| 658 |
row.pop(5)
|
|
|
|
| 662 |
row.pop(6)
|
| 663 |
row.pop(10)
|
| 664 |
row.pop(10)
|
| 665 |
+
|
| 666 |
result.append(row)
|
| 667 |
return result
|
| 668 |
|
| 669 |
+
|
| 670 |
def parse_value(value):
|
| 671 |
+
if value.endswith("%"):
|
| 672 |
return float(value[:-1])
|
| 673 |
try:
|
| 674 |
return float(value)
|
|
|
|
| 676 |
return value
|
| 677 |
|
| 678 |
|
| 679 |
+
with open("./data.csv", "r") as file:
|
| 680 |
csv_text = file.read()
|
| 681 |
DATA = parse_csv(csv_text)
|
| 682 |
+
DATA_SUMMARY = [
|
| 683 |
+
row[:5]
|
| 684 |
+
+ [round((row[5] + row[6] + row[7] + row[8]) / 4, 2)]
|
| 685 |
+
+ [round((row[9] + row[10] + row[11] + row[12]) / 4, 2)]
|
| 686 |
+
+ row[13:16]
|
| 687 |
+
for row in DATA
|
| 688 |
+
]
|
| 689 |
|
| 690 |
MODELS = [
|
| 691 |
"gorilla-openfunctions-v2",
|
| 692 |
"gpt-4-1106-preview-fc",
|
| 693 |
"gpt-4-0125-preview-fc",
|
| 694 |
"gpt-3.5-turbo-0125-fc",
|
| 695 |
+
"mistral-large-fc",
|
| 696 |
]
|
| 697 |
|
| 698 |
|
| 699 |
def send_feedback(prompt, function, model, temperature, codeOutput, jsonOutput, vote):
|
| 700 |
# Login and get access token
|
| 701 |
+
login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
|
| 702 |
+
headers = {"Content-Type": "application/json"}
|
| 703 |
+
login_data = {"username": "website", "password": mongoDBPassword}
|
|
|
|
|
|
|
|
|
|
| 704 |
response = requests.post(login_url, headers=headers, json=login_data)
|
| 705 |
+
access_token = response.json()["access_token"]
|
| 706 |
# Prepare data for sending feedback
|
| 707 |
+
url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/insertOne"
|
| 708 |
headers = {
|
| 709 |
+
"Content-Type": "application/json",
|
| 710 |
+
"Access-Control-Request-Headers": "*",
|
| 711 |
+
"Authorization": f"Bearer {access_token}",
|
| 712 |
}
|
| 713 |
|
|
|
|
| 714 |
if not prompt or not function:
|
| 715 |
return
|
| 716 |
|
| 717 |
body = {
|
| 718 |
+
"collection": "vote",
|
| 719 |
+
"database": "gorilla-feedback",
|
| 720 |
+
"dataSource": "gorilla",
|
| 721 |
+
"document": {
|
| 722 |
+
"prompt": prompt,
|
| 723 |
+
"funcDef": function,
|
| 724 |
+
"temperature": temperature,
|
| 725 |
+
"model": model,
|
| 726 |
+
"codeOutput": codeOutput,
|
| 727 |
+
"jsonOutput": jsonOutput,
|
| 728 |
+
"result": vote,
|
| 729 |
+
},
|
| 730 |
}
|
| 731 |
|
| 732 |
# Send feedback
|
|
|
|
| 734 |
if response.ok:
|
| 735 |
print("Document inserted:", response.json())
|
| 736 |
else:
|
| 737 |
+
print("Error:", response.text)
|
| 738 |
+
|
| 739 |
|
| 740 |
def get_voting_result():
|
| 741 |
+
login_url = "https://us-west-2.aws.realm.mongodb.com/api/client/v2.0/app/data-onwzq/auth/providers/local-userpass/login"
|
| 742 |
+
headers = {"Content-Type": "application/json"}
|
| 743 |
+
login_data = {"username": "website", "password": mongoDBPassword}
|
|
|
|
|
|
|
|
|
|
| 744 |
response = requests.post(login_url, headers=headers, json=login_data)
|
| 745 |
+
access_token = response.json()["access_token"]
|
| 746 |
+
|
| 747 |
# Scanning the database
|
| 748 |
+
url = "https://us-west-2.aws.data.mongodb-api.com/app/data-onwzq/endpoint/data/v1/action/find"
|
| 749 |
headers = {
|
| 750 |
+
"Content-Type": "application/json",
|
| 751 |
+
"Access-Control-Request-Headers": "*",
|
| 752 |
+
"Authorization": f"Bearer {access_token}",
|
| 753 |
}
|
| 754 |
body = {
|
| 755 |
+
"collection": "vote",
|
| 756 |
+
"database": "gorilla-feedback",
|
| 757 |
+
"dataSource": "gorilla",
|
| 758 |
}
|
| 759 |
response = requests.post(url, headers=headers, json=body)
|
| 760 |
if response.ok:
|
| 761 |
data = response.json()
|
| 762 |
+
votes = data["documents"]
|
| 763 |
+
votes = [vote for vote in votes if vote["result"] in ["positive", "negative"]]
|
| 764 |
# extract only the model, positive count, negative count
|
| 765 |
model_votes = {}
|
| 766 |
for vote in votes:
|
| 767 |
+
model = vote["model"]
|
| 768 |
if model not in model_votes:
|
| 769 |
+
model_votes[model] = {"positive": 0, "negative": 0}
|
| 770 |
+
model_votes[model][vote["result"]] += 1
|
| 771 |
for model in model_votes:
|
| 772 |
+
model_votes[model]["accuracy"] = model_votes[model]["positive"] / (
|
| 773 |
+
model_votes[model]["positive"] + model_votes[model]["negative"]
|
| 774 |
+
)
|
| 775 |
+
|
| 776 |
result = []
|
| 777 |
for model in model_votes:
|
| 778 |
+
result.append(
|
| 779 |
+
[
|
| 780 |
+
model,
|
| 781 |
+
model_votes[model]["accuracy"],
|
| 782 |
+
model_votes[model]["positive"],
|
| 783 |
+
model_votes[model]["negative"],
|
| 784 |
+
]
|
| 785 |
+
)
|
| 786 |
result = sorted(result, key=lambda x: x[1], reverse=True)
|
| 787 |
+
return pd.DataFrame(
|
| 788 |
+
result, columns=["Model", "Accuracy", "Positive", "Negative"]
|
| 789 |
+
)
|
| 790 |
else:
|
| 791 |
+
print("Error:", response.text)
|
| 792 |
return []
|
| 793 |
+
|
| 794 |
+
|
| 795 |
+
def send_feedback_negative(
|
| 796 |
+
prompt, function, model, temperature, codeOutput, jsonOutput
|
| 797 |
+
):
|
| 798 |
+
send_feedback(
|
| 799 |
+
prompt, function, model, temperature, codeOutput, jsonOutput, "negative"
|
| 800 |
+
)
|
| 801 |
return "Thank you for your feedback. We will use this to improve our service."
|
| 802 |
|
| 803 |
+
|
| 804 |
+
def send_feedback_positive(
|
| 805 |
+
prompt, function, model, temperature, codeOutput, jsonOutput
|
| 806 |
+
):
|
| 807 |
+
send_feedback(
|
| 808 |
+
prompt, function, model, temperature, codeOutput, jsonOutput, "positive"
|
| 809 |
+
)
|
| 810 |
return "Thank you for your feedback. We will use this to improve our service."
|
| 811 |
|
| 812 |
|
|
|
|
| 943 |
|
| 944 |
|
| 945 |
def get_mistral_response(prompt, function, model, temperature):
|
| 946 |
+
client = MistralClient(api_key=mistralKey)
|
| 947 |
oai_tool = []
|
| 948 |
function = json.loads(function)
|
| 949 |
item = function # use item in the later code
|
|
|
|
| 951 |
item["name"] = re.sub(
|
| 952 |
r"\.", "_", item["name"]
|
| 953 |
) # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
|
| 954 |
+
item["parameters"][
|
| 955 |
+
"type"
|
| 956 |
+
] = "object" # If typing is missing, we assume it is an object since OAI requires a type.
|
| 957 |
if "properties" not in item["parameters"]:
|
| 958 |
item["parameters"]["properties"] = item["parameters"].copy()
|
| 959 |
item["parameters"]["type"] = "object"
|
|
|
|
| 968 |
)
|
| 969 |
oai_tool.append({"type": "function", "function": item})
|
| 970 |
message = [
|
| 971 |
+
ChatMessage(role="user", content=prompt),
|
| 972 |
+
]
|
| 973 |
chat_response = client.chat(
|
| 974 |
model="mistral-large-latest",
|
| 975 |
messages=message,
|
| 976 |
+
tools=oai_tool,
|
| 977 |
temperature=temperature,
|
| 978 |
)
|
| 979 |
try:
|
|
|
|
| 989 |
except:
|
| 990 |
result = chat_response.choices[0].message.content
|
| 991 |
return result, "The model failed to return a JSON output."
|
| 992 |
+
|
| 993 |
+
|
| 994 |
def distribute_task(prompt, function, model, temperature):
|
| 995 |
if "gpt" in model:
|
| 996 |
return get_openai_response(prompt, function, model, temperature)
|
|
|
|
| 1008 |
return leaderboard_df
|
| 1009 |
|
| 1010 |
|
| 1011 |
+
def get_summary():
|
| 1012 |
+
# Convert the leaderboard data to a pandas DataFrame for easier handling and display
|
| 1013 |
+
leaderboard_df = pd.DataFrame(DATA_SUMMARY, columns=COLUMNS_SUMMARY)
|
| 1014 |
+
leaderboard_df = leaderboard_df.sort_values(by="Rank")
|
| 1015 |
+
return leaderboard_df
|
| 1016 |
+
|
| 1017 |
+
|
| 1018 |
prompt = gr.Textbox(label="Prompt", placeholder="Type your prompt here...", lines=4)
|
| 1019 |
funcDescription = gr.Textbox(
|
| 1020 |
label="Function Description", placeholder="Describe the function...", lines=20
|
|
|
|
| 1024 |
|
| 1025 |
with gr.Blocks() as demo:
|
| 1026 |
with gr.Tabs():
|
| 1027 |
+
with gr.TabItem("Summary Leaderboard"):
|
| 1028 |
+
gr.Markdown(
|
| 1029 |
+
"**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/10_checker_manual.html) and [code](https://github.com/ShishirPatil/gorilla).**"
|
| 1030 |
+
)
|
| 1031 |
+
gr.Markdown(
|
| 1032 |
+
"""**AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**
|
| 1033 |
+
|
| 1034 |
+
**FC = native support for function/tool calling.**
|
| 1035 |
+
|
| 1036 |
+
**Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.**
|
| 1037 |
+
|
| 1038 |
+
**AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.**
|
| 1039 |
+
|
| 1040 |
+
**Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).**
|
| 1041 |
+
"""
|
| 1042 |
+
)
|
| 1043 |
+
leaderboard_data = gr.Dataframe(value=get_summary(), wrap=True)
|
| 1044 |
+
with gr.TabItem("Full Leaderboard"):
|
| 1045 |
gr.Markdown(
|
| 1046 |
+
"**This live leaderboard evaluates the LLM's ability to call functions (aka tools) accurately. This leaderboard consists of real-world data and will be updated periodically. For more information on the evaluation dataset and methodology, please refer to our [blog](https://gorilla.cs.berkeley.edu/blogs/10_checker_manual.html) and [code](https://github.com/ShishirPatil/gorilla).**"
|
| 1047 |
)
|
| 1048 |
+
gr.Markdown(
|
| 1049 |
+
"""**AST means evaluation through Abstract Syntax Tree and Exec means evaluation through execution.**
|
| 1050 |
+
|
| 1051 |
+
**FC = native support for function/tool calling.**
|
| 1052 |
+
|
| 1053 |
+
**Cost is calculated as an estimate of the cost per 1000 function calls, in USD. Latency is measured in seconds.**
|
| 1054 |
+
|
| 1055 |
+
**AST Summary is the unweighted average of the four test categories under AST Evaluation. Exec Summary is the unweighted average of the four test categories under Exec Evaluation.**
|
| 1056 |
+
|
| 1057 |
+
**Click on column header to sort. If you would like to add your model or contribute test-cases, please contact us via [discord](https://discord.gg/SwTyuTAxX3).**
|
| 1058 |
+
"""
|
| 1059 |
)
|
| 1060 |
+
leaderboard_data = gr.Dataframe(value=get_leaderboard(), wrap=True)
|
| 1061 |
|
| 1062 |
with gr.TabItem("Try It Out"):
|
| 1063 |
with gr.Row():
|
|
|
|
| 1123 |
fn=None,
|
| 1124 |
inputs=[prompt, model, temperature, codeOutput, jsonOutput],
|
| 1125 |
outputs=[],
|
| 1126 |
+
js='(prompt, model, temperature, codeOutput, jsonOutput) => window.open(`https://github.com/ShishirPatil/gorilla/issues/new?assignees=&labels=hosted-openfunctions-v2&projects=&template=hosted-openfunctions-v2.md&title=[bug] OpenFunctions-v2: &body=**Issue Description**%0A%0APrompt: ${prompt}%0A%0AModel: ${model}%0A%0ATemperature: ${temperature}%0A%0AOutput (or Error if request failed): ${codeOutput} %0A%0A ${jsonOutput}%0A%0A**Additional Information**\n`, "_blank")',
|
| 1127 |
)
|
| 1128 |
+
|
| 1129 |
thumbs_up.click(
|
| 1130 |
fn=send_feedback_positive,
|
| 1131 |
+
inputs=[
|
| 1132 |
+
prompt,
|
| 1133 |
+
funcDescription,
|
| 1134 |
+
model,
|
| 1135 |
+
temperature,
|
| 1136 |
+
codeOutput,
|
| 1137 |
+
jsonOutput,
|
| 1138 |
+
],
|
| 1139 |
outputs=[feedbackMsg],
|
| 1140 |
)
|
| 1141 |
+
|
| 1142 |
thumbs_down.click(
|
| 1143 |
fn=send_feedback_negative,
|
| 1144 |
+
inputs=[
|
| 1145 |
+
prompt,
|
| 1146 |
+
funcDescription,
|
| 1147 |
+
model,
|
| 1148 |
+
temperature,
|
| 1149 |
+
codeOutput,
|
| 1150 |
+
jsonOutput,
|
| 1151 |
+
],
|
| 1152 |
outputs=[feedbackMsg],
|
| 1153 |
)
|
| 1154 |
|
|
|
|
| 1157 |
# leaderboard_data = gr.Dataframe(
|
| 1158 |
# value=get_voting_result(), wrap=True
|
| 1159 |
# )
|
| 1160 |
+
|
| 1161 |
demo.launch()
|