test-agent-2
#2
by
meghsn - opened
This view is limited to 50 files because it contains too many changes.
See the raw diff here.
- README.md +0 -1
- app.py +28 -82
- results/Bgym-GPT-3.5/README.md +1 -0
- results/Bgym-GPT-3.5/config.json +4 -0
- results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json +4 -4
- results/Bgym-GPT-3.5/webarena.json +16 -0
- results/{GenericAgent-GPT-o1-mini → Bgym-GPT-3.5}/workarena-l1.json +27 -13
- results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json +4 -4
- results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json +3 -3
- results/Bgym-GPT-4o-V/README.md +1 -0
- results/Bgym-GPT-4o-V/config.json +4 -0
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json +4 -4
- results/Bgym-GPT-4o-V/webarena.json +16 -0
- results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json +4 -4
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json +4 -4
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json +3 -3
- results/Bgym-GPT-4o/README.md +1 -0
- results/Bgym-GPT-4o/config.json +4 -0
- results/Bgym-GPT-4o/miniwob.json +16 -0
- results/Bgym-GPT-4o/webarena.json +16 -0
- results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json +4 -4
- results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json +4 -4
- results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json +3 -3
- results/Bgym-Llama-3-70b/README.md +1 -0
- results/Bgym-Llama-3-70b/config.json +4 -0
- results/Bgym-Llama-3-70b/miniwob.json +16 -0
- results/Bgym-Llama-3-70b/webarena.json +16 -0
- results/Bgym-Llama-3-70b/workarena-l1.json +58 -0
- results/Bgym-Llama-3-70b/workarena-l2.json +16 -0
- results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json +4 -4
- results/Bgym-Mixtral-8x22b/README.md +1 -0
- results/Bgym-Mixtral-8x22b/config.json +4 -0
- results/Bgym-Mixtral-8x22b/miniwob.json +16 -0
- results/Bgym-Mixtral-8x22b/webarena.json +16 -0
- results/Bgym-Mixtral-8x22b/workarena-l1.json +44 -0
- results/Bgym-Mixtral-8x22b/workarena-l2.json +16 -0
- results/Bgym-Mixtral-8x22b/workarena-l3.json +16 -0
- results/GenericAgent-AgentTrek-1.0-32b/README.md +0 -85
- results/GenericAgent-AgentTrek-1.0-32b/miniwob.json +0 -16
- results/GenericAgent-AgentTrek-1.0-32b/webarena.json +0 -16
- results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json +0 -16
- results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json +0 -16
- results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/README.md +0 -46
- results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/miniwob.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/webarena.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/weblinx.json +0 -16
- results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json +0 -16
README.md
CHANGED
|
@@ -6,7 +6,6 @@ colorTo: green
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
| 9 |
-
short_description: Tracks perf of LLMs, VLMs and agents on web navigation tasks
|
| 10 |
---
|
| 11 |
|
| 12 |
# BrowserGym Leaderboard
|
|
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
license: mit
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
# BrowserGym Leaderboard
|
app.py
CHANGED
|
@@ -9,7 +9,6 @@ import plotly.graph_objs as go
|
|
| 9 |
from huggingface_hub import HfApi
|
| 10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
| 11 |
import streamlit.components.v1 as components
|
| 12 |
-
from datetime import datetime
|
| 13 |
|
| 14 |
from urllib.parse import quote
|
| 15 |
from pathlib import Path
|
|
@@ -17,7 +16,7 @@ import re
|
|
| 17 |
import html
|
| 18 |
from typing import Dict, Any
|
| 19 |
|
| 20 |
-
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",
|
| 21 |
|
| 22 |
def sanitize_agent_name(agent_name):
|
| 23 |
# Only allow alphanumeric chars, hyphen, underscore
|
|
@@ -44,34 +43,12 @@ def sanitize_column_name(col: str) -> str:
|
|
| 44 |
return html.escape(str(col))
|
| 45 |
|
| 46 |
def sanitize_cell_value(value: Any) -> str:
|
|
|
|
| 47 |
if isinstance(value, (int, float)):
|
| 48 |
return str(value)
|
| 49 |
-
if isinstance(value, str) and '±' in value:
|
| 50 |
-
score, std_err = value.split('±')
|
| 51 |
-
return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err.strip()}</span>'
|
| 52 |
return html.escape(str(value))
|
| 53 |
|
| 54 |
def create_html_table_main(df):
|
| 55 |
-
col1, col2 = st.columns([2,6])
|
| 56 |
-
with col1:
|
| 57 |
-
sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
|
| 58 |
-
with col2:
|
| 59 |
-
sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
|
| 60 |
-
|
| 61 |
-
def get_sort_value(row):
|
| 62 |
-
if row == "-":
|
| 63 |
-
return float('-inf')
|
| 64 |
-
else:
|
| 65 |
-
try:
|
| 66 |
-
return float(row)
|
| 67 |
-
except ValueError:
|
| 68 |
-
return row
|
| 69 |
-
|
| 70 |
-
# Sort dataframe
|
| 71 |
-
if sort_order == "Ascending":
|
| 72 |
-
df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
|
| 73 |
-
else:
|
| 74 |
-
df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
|
| 75 |
html = '''
|
| 76 |
<style>
|
| 77 |
table {
|
|
@@ -110,28 +87,7 @@ def create_html_table_main(df):
|
|
| 110 |
html += '</div>'
|
| 111 |
return html
|
| 112 |
|
| 113 |
-
def create_html_table_benchmark(df
|
| 114 |
-
col1, col2 = st.columns([2,6])
|
| 115 |
-
with col1:
|
| 116 |
-
sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
|
| 117 |
-
with col2:
|
| 118 |
-
sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
|
| 119 |
-
|
| 120 |
-
def get_sort_value(row):
|
| 121 |
-
if row == "-":
|
| 122 |
-
return float('-inf')
|
| 123 |
-
else:
|
| 124 |
-
try:
|
| 125 |
-
return float(row)
|
| 126 |
-
except ValueError:
|
| 127 |
-
return row
|
| 128 |
-
|
| 129 |
-
# Sort dataframe
|
| 130 |
-
if sort_order == "Ascending":
|
| 131 |
-
df = df.sort_values(by=sort_column, key=lambda x: x.apply(get_sort_value))
|
| 132 |
-
else:
|
| 133 |
-
df = df.sort_values(by=sort_column, ascending=False, key=lambda x: x.apply(get_sort_value))
|
| 134 |
-
|
| 135 |
html = '''
|
| 136 |
<style>
|
| 137 |
table {
|
|
@@ -155,9 +111,8 @@ def create_html_table_benchmark(df, benchmark):
|
|
| 155 |
html += '<table>'
|
| 156 |
html += '<thead><tr>'
|
| 157 |
for column in df.columns:
|
| 158 |
-
if column =
|
| 159 |
-
|
| 160 |
-
html += f'<th>{sanitize_column_name(column)}</th>'
|
| 161 |
html += '</tr></thead>'
|
| 162 |
html += '<tbody>'
|
| 163 |
for _, row in df.iterrows():
|
|
@@ -170,11 +125,8 @@ def create_html_table_benchmark(df, benchmark):
|
|
| 170 |
summary = sanitize_cell_value(row[column])
|
| 171 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
| 172 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 173 |
-
elif column == "Reproduced_all"
|
| 174 |
continue
|
| 175 |
-
elif column == "Score":
|
| 176 |
-
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
| 177 |
-
html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
|
| 178 |
else:
|
| 179 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
| 180 |
html += '</tr>'
|
|
@@ -209,19 +161,6 @@ def check_sanity(agent):
|
|
| 209 |
|
| 210 |
def main():
|
| 211 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
| 212 |
-
st.markdown("""
|
| 213 |
-
<style>
|
| 214 |
-
:root {
|
| 215 |
-
--lighter-color: #888; /* Default for light theme */
|
| 216 |
-
}
|
| 217 |
-
@media (prefers-color-scheme: dark) {
|
| 218 |
-
:root {
|
| 219 |
-
--lighter-color: #ccc; /* Default for dark theme */
|
| 220 |
-
}
|
| 221 |
-
}
|
| 222 |
-
</style>
|
| 223 |
-
""", unsafe_allow_html=True)
|
| 224 |
-
|
| 225 |
st.markdown("""
|
| 226 |
<head>
|
| 227 |
<meta http-equiv="Content-Security-Policy"
|
|
@@ -244,10 +183,7 @@ def main():
|
|
| 244 |
continue
|
| 245 |
agent_results = []
|
| 246 |
for benchmark in BENCHMARKS:
|
| 247 |
-
|
| 248 |
-
if not file_path.is_file():
|
| 249 |
-
continue
|
| 250 |
-
with open(file_path) as f:
|
| 251 |
agent_results.extend(json.load(f))
|
| 252 |
all_results[agent] = agent_results
|
| 253 |
|
|
@@ -281,9 +217,11 @@ def main():
|
|
| 281 |
if dfs_to_concat:
|
| 282 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
| 283 |
|
| 284 |
-
for
|
| 285 |
-
|
| 286 |
-
|
|
|
|
|
|
|
| 287 |
# Add a search bar
|
| 288 |
search_query = st.text_input("Search agents", "", key="search_main")
|
| 289 |
|
|
@@ -302,6 +240,14 @@ def main():
|
|
| 302 |
return ""
|
| 303 |
|
| 304 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
html_table = create_html_table_main(df)
|
| 306 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 307 |
|
|
@@ -449,21 +395,18 @@ MIT
|
|
| 449 |
for value in values:
|
| 450 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
|
| 451 |
result_dict["Score"] = value["score"]
|
| 452 |
-
result_dict["std_err"] = value["std_err"]
|
| 453 |
result_dict["Benchmark Specific"] = value["benchmark_specific"]
|
| 454 |
result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
|
| 455 |
result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
|
| 456 |
result_dict["Reproducible"] = value["reproducible"]
|
| 457 |
result_dict["Comments"] = value["comments"]
|
| 458 |
result_dict["Study ID"] = value["study_id"]
|
| 459 |
-
value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
|
| 460 |
result_dict["Date"] = value["date_time"]
|
| 461 |
result_dict["Reproduced"] = []
|
| 462 |
result_dict["Reproduced_all"] = []
|
| 463 |
flag = 1
|
| 464 |
if not flag:
|
| 465 |
result_dict["Score"] = "-"
|
| 466 |
-
result_dict["std_err"] = "-"
|
| 467 |
result_dict["Benchmark Specific"] = "-"
|
| 468 |
result_dict["Benchmark Tuned"] = "-"
|
| 469 |
result_dict["Followed Evaluation Protocol"] = "-"
|
|
@@ -475,7 +418,6 @@ MIT
|
|
| 475 |
result_dict["Reproduced_all"] = []
|
| 476 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
|
| 477 |
result_dict["Reproduced"].append(value["score"])
|
| 478 |
-
value["date_time"] = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
|
| 479 |
result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
|
| 480 |
if result_dict["Reproduced"]:
|
| 481 |
result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
|
|
@@ -493,10 +435,14 @@ MIT
|
|
| 493 |
# Concatenate the DataFrames
|
| 494 |
if dfs_to_concat:
|
| 495 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
df_
|
| 499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 501 |
|
| 502 |
|
|
|
|
| 9 |
from huggingface_hub import HfApi
|
| 10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
| 11 |
import streamlit.components.v1 as components
|
|
|
|
| 12 |
|
| 13 |
from urllib.parse import quote
|
| 14 |
from pathlib import Path
|
|
|
|
| 16 |
import html
|
| 17 |
from typing import Dict, Any
|
| 18 |
|
| 19 |
+
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB",]
|
| 20 |
|
| 21 |
def sanitize_agent_name(agent_name):
|
| 22 |
# Only allow alphanumeric chars, hyphen, underscore
|
|
|
|
| 43 |
return html.escape(str(col))
|
| 44 |
|
| 45 |
def sanitize_cell_value(value: Any) -> str:
|
| 46 |
+
"""Sanitize cell values for HTML display"""
|
| 47 |
if isinstance(value, (int, float)):
|
| 48 |
return str(value)
|
|
|
|
|
|
|
|
|
|
| 49 |
return html.escape(str(value))
|
| 50 |
|
| 51 |
def create_html_table_main(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
html = '''
|
| 53 |
<style>
|
| 54 |
table {
|
|
|
|
| 87 |
html += '</div>'
|
| 88 |
return html
|
| 89 |
|
| 90 |
+
def create_html_table_benchmark(df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
html = '''
|
| 92 |
<style>
|
| 93 |
table {
|
|
|
|
| 111 |
html += '<table>'
|
| 112 |
html += '<thead><tr>'
|
| 113 |
for column in df.columns:
|
| 114 |
+
if column != "Reproduced_all":
|
| 115 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
|
|
|
| 116 |
html += '</tr></thead>'
|
| 117 |
html += '<tbody>'
|
| 118 |
for _, row in df.iterrows():
|
|
|
|
| 125 |
summary = sanitize_cell_value(row[column])
|
| 126 |
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
| 127 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
| 128 |
+
elif column == "Reproduced_all":
|
| 129 |
continue
|
|
|
|
|
|
|
|
|
|
| 130 |
else:
|
| 131 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
| 132 |
html += '</tr>'
|
|
|
|
| 161 |
|
| 162 |
def main():
|
| 163 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
st.markdown("""
|
| 165 |
<head>
|
| 166 |
<meta http-equiv="Content-Security-Policy"
|
|
|
|
| 183 |
continue
|
| 184 |
agent_results = []
|
| 185 |
for benchmark in BENCHMARKS:
|
| 186 |
+
with open(f"results/{agent}/{benchmark.lower()}.json") as f:
|
|
|
|
|
|
|
|
|
|
| 187 |
agent_results.extend(json.load(f))
|
| 188 |
all_results[agent] = agent_results
|
| 189 |
|
|
|
|
| 217 |
if dfs_to_concat:
|
| 218 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
| 219 |
|
| 220 |
+
# df['Average'] = sum(df[column] for column in BENCHMARKS)/len(BENCHMARKS)
|
| 221 |
+
# df['Average'] = df['Average'].round(2)
|
| 222 |
+
# Sort values
|
| 223 |
+
df = df.sort_values(by='WebArena', ascending=False)
|
| 224 |
+
|
| 225 |
# Add a search bar
|
| 226 |
search_query = st.text_input("Search agents", "", key="search_main")
|
| 227 |
|
|
|
|
| 240 |
return ""
|
| 241 |
|
| 242 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
| 243 |
+
# st.dataframe(
|
| 244 |
+
# df[['Agent'] + BENCHMARKS],
|
| 245 |
+
# use_container_width=True,
|
| 246 |
+
# column_config={benchmark: {'alignment': 'center'} for benchmark in BENCHMARKS},
|
| 247 |
+
# hide_index=True,
|
| 248 |
+
# # height=int(len(df) * 36.2),
|
| 249 |
+
# )
|
| 250 |
+
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|
| 251 |
html_table = create_html_table_main(df)
|
| 252 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 253 |
|
|
|
|
| 395 |
for value in values:
|
| 396 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Original":
|
| 397 |
result_dict["Score"] = value["score"]
|
|
|
|
| 398 |
result_dict["Benchmark Specific"] = value["benchmark_specific"]
|
| 399 |
result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
|
| 400 |
result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
|
| 401 |
result_dict["Reproducible"] = value["reproducible"]
|
| 402 |
result_dict["Comments"] = value["comments"]
|
| 403 |
result_dict["Study ID"] = value["study_id"]
|
|
|
|
| 404 |
result_dict["Date"] = value["date_time"]
|
| 405 |
result_dict["Reproduced"] = []
|
| 406 |
result_dict["Reproduced_all"] = []
|
| 407 |
flag = 1
|
| 408 |
if not flag:
|
| 409 |
result_dict["Score"] = "-"
|
|
|
|
| 410 |
result_dict["Benchmark Specific"] = "-"
|
| 411 |
result_dict["Benchmark Tuned"] = "-"
|
| 412 |
result_dict["Followed Evaluation Protocol"] = "-"
|
|
|
|
| 418 |
result_dict["Reproduced_all"] = []
|
| 419 |
if value["benchmark"] == benchmark and value["original_or_reproduced"] == "Reproduced":
|
| 420 |
result_dict["Reproduced"].append(value["score"])
|
|
|
|
| 421 |
result_dict["Reproduced_all"].append(", ".join([str(value["score"]), str(value["date_time"])]))
|
| 422 |
if result_dict["Reproduced"]:
|
| 423 |
result_dict["Reproduced"] = str(min(result_dict["Reproduced"])) + " - " + str(max(result_dict["Reproduced"]))
|
|
|
|
| 435 |
# Concatenate the DataFrames
|
| 436 |
if dfs_to_concat:
|
| 437 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
| 438 |
+
# st.markdown(f"<h2 id='{benchmark.lower()}'>{benchmark}</h2>", unsafe_allow_html=True)
|
| 439 |
+
# st.dataframe(
|
| 440 |
+
# df_,
|
| 441 |
+
# use_container_width=True,
|
| 442 |
+
# column_config={benchmark: {'alignment': 'center'}},
|
| 443 |
+
# hide_index=True,
|
| 444 |
+
# )
|
| 445 |
+
html_table = create_html_table_benchmark(df_)
|
| 446 |
st.markdown(html_table, unsafe_allow_html=True)
|
| 447 |
|
| 448 |
|
results/Bgym-GPT-3.5/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
## GPT-3.5 model
|
results/Bgym-GPT-3.5/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "GPT-3.5",
|
| 3 |
+
"backend_llm": "GPT-3.5"
|
| 4 |
+
}
|
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/miniwob.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 43.4,
|
| 8 |
+
"std_err": 0.1,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-3.5/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 6.7,
|
| 8 |
+
"std_err": 0.2,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/{GenericAgent-GPT-o1-mini → Bgym-GPT-3.5}/workarena-l1.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
-
"date_time": "
|
| 6 |
"benchmark": "WorkArena-L1",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
@@ -14,17 +14,31 @@
|
|
| 14 |
"original_or_reproduced": "Original"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
-
"agent_name": "
|
| 18 |
-
"study_id": "
|
| 19 |
-
"date_time": "2025-02-07 14:00:00",
|
| 20 |
"benchmark": "WorkArena-L1",
|
| 21 |
-
"score":
|
| 22 |
-
"std_err":
|
| 23 |
"benchmark_specific": "No",
|
| 24 |
"benchmark_tuned": "No",
|
| 25 |
-
"followed_evaluation_protocol": "Yes",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
"reproducible": "Yes",
|
| 27 |
-
"comments": "
|
| 28 |
-
"original_or_reproduced": "Reproduced"
|
|
|
|
| 29 |
}
|
| 30 |
]
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
| 7 |
+
"score": 6.1,
|
| 8 |
+
"std_err": 0.3,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 14 |
"original_or_reproduced": "Original"
|
| 15 |
},
|
| 16 |
{
|
| 17 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 18 |
+
"study_id": "study_id",
|
|
|
|
| 19 |
"benchmark": "WorkArena-L1",
|
| 20 |
+
"score": 5.7,
|
| 21 |
+
"std_err": 0.3,
|
| 22 |
"benchmark_specific": "No",
|
| 23 |
"benchmark_tuned": "No",
|
| 24 |
+
"followed_evaluation_protocol": "Yes",
|
| 25 |
+
"reproducible": "Yes",
|
| 26 |
+
"comments": "NA",
|
| 27 |
+
"original_or_reproduced": "Reproduced",
|
| 28 |
+
"date_time": "2021-01-04 12:06:00"
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"benchmark": "WorkArena-L1",
|
| 32 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 33 |
+
"study_id": "study_id",
|
| 34 |
+
"score": 5.1,
|
| 35 |
+
"std_err": 0.3,
|
| 36 |
+
"benchmark_specific": "No",
|
| 37 |
+
"benchmark_tuned": "No",
|
| 38 |
+
"followed_evaluation_protocol": "Yes",
|
| 39 |
"reproducible": "Yes",
|
| 40 |
+
"comments": "NA",
|
| 41 |
+
"original_or_reproduced": "Reproduced",
|
| 42 |
+
"date_time": "2021-01-04 12:06:00"
|
| 43 |
}
|
| 44 |
]
|
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l2.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-4o → Bgym-GPT-3.5}/workarena-l3.json
RENAMED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
-
"date_time": "
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
"score": 0.0,
|
| 8 |
"std_err": 0.0,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-3.5",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
"score": 0.0,
|
| 8 |
"std_err": 0.0,
|
results/Bgym-GPT-4o-V/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
## GPT-4o-V model
|
results/Bgym-GPT-4o-V/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "GPT-4o-V",
|
| 3 |
+
"backend_llm": "GPT-4o-V"
|
| 4 |
+
}
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/miniwob.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 72.5,
|
| 8 |
+
"std_err": 0.5,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-4o-V/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 24.0,
|
| 8 |
+
"std_err": 0.4,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/{GenericAgent-GPT-4o → Bgym-GPT-4o-V}/workarena-l1.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
| 7 |
+
"score": 41.8,
|
| 8 |
+
"std_err": 0.4,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l2.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err": 0.
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 3.8,
|
| 8 |
+
"std_err": 0.6,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o-V}/workarena-l3.json
RENAMED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
-
"date_time": "
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
"score": 0.0,
|
| 8 |
"std_err": 0.0,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o-V",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
"score": 0.0,
|
| 8 |
"std_err": 0.0,
|
results/Bgym-GPT-4o/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
## GPT-4o model
|
results/Bgym-GPT-4o/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "GPT-4o",
|
| 3 |
+
"backend_llm": "GPT-4o"
|
| 4 |
+
}
|
results/Bgym-GPT-4o/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 71.3,
|
| 8 |
+
"std_err": 0.5,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-GPT-4o/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 23.5,
|
| 8 |
+
"std_err": 0.4,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/{GenericAgent-GPT-4o-mini → Bgym-GPT-4o}/workarena-l1.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L1",
|
| 7 |
+
"score": 42.7,
|
| 8 |
+
"std_err": 0.4,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-GPT-4o}/workarena-l2.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score":
|
| 8 |
-
"std_err":
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 3.0,
|
| 8 |
+
"std_err": 0.6,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/{GenericAgent-GPT-o1-mini → Bgym-GPT-4o}/workarena-l3.json
RENAMED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
-
"date_time": "
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
"score": 0.0,
|
| 8 |
"std_err": 0.0,
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-GPT-4o",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
"score": 0.0,
|
| 8 |
"std_err": 0.0,
|
results/Bgym-Llama-3-70b/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
### Llama-3-70B
|
results/Bgym-Llama-3-70b/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "Llama-3-70B",
|
| 3 |
+
"backend_llm": "Llama-3-70B"
|
| 4 |
+
}
|
results/Bgym-Llama-3-70b/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 68.2,
|
| 8 |
+
"std_err": 0.7,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3-70b/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 11.0,
|
| 8 |
+
"std_err": 0.3,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Llama-3-70b/workarena-l1.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 17.9,
|
| 7 |
+
"std_err": 0.6,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-01 12:00:00"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 18 |
+
"study_id": "study_id",
|
| 19 |
+
"benchmark": "WorkArena-L1",
|
| 20 |
+
"score": 15.9,
|
| 21 |
+
"std_err": 0.6,
|
| 22 |
+
"benchmark_specific": "No",
|
| 23 |
+
"benchmark_tuned": "No",
|
| 24 |
+
"followed_evaluation_protocol": "Yes",
|
| 25 |
+
"reproducible": "Yes",
|
| 26 |
+
"comments": "NA",
|
| 27 |
+
"original_or_reproduced": "Reproduced",
|
| 28 |
+
"date_time": "2021-01-04 12:06:00"
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 32 |
+
"study_id": "study_id",
|
| 33 |
+
"benchmark": "WorkArena-L1",
|
| 34 |
+
"score": 19.9,
|
| 35 |
+
"std_err": 0.6,
|
| 36 |
+
"benchmark_specific": "No",
|
| 37 |
+
"benchmark_tuned": "No",
|
| 38 |
+
"followed_evaluation_protocol": "Yes",
|
| 39 |
+
"reproducible": "Yes",
|
| 40 |
+
"comments": "NA",
|
| 41 |
+
"original_or_reproduced": "Reproduced",
|
| 42 |
+
"date_time": "2021-01-05 2:07:00"
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 46 |
+
"study_id": "study_id",
|
| 47 |
+
"benchmark": "WorkArena-L1",
|
| 48 |
+
"score": 17.9,
|
| 49 |
+
"std_err": 0.6,
|
| 50 |
+
"benchmark_specific": "No",
|
| 51 |
+
"benchmark_tuned": "No",
|
| 52 |
+
"followed_evaluation_protocol": "Yes",
|
| 53 |
+
"reproducible": "Yes",
|
| 54 |
+
"comments": "NA",
|
| 55 |
+
"original_or_reproduced": "Reproduced",
|
| 56 |
+
"date_time": "2021-01-12 12:00:00"
|
| 57 |
+
}
|
| 58 |
+
]
|
results/Bgym-Llama-3-70b/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/{GenericAgent-Claude-3.5-Sonnet → Bgym-Llama-3-70b}/workarena-l3.json
RENAMED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"agent_name": "
|
| 4 |
-
"study_id": "
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
-
"score": 0.
|
| 8 |
-
"std_err": 0.
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Bgym-Llama-3-70b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
"date_time": "2021-01-01 12:00:00",
|
| 6 |
"benchmark": "WorkArena-L3",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
"benchmark_specific": "No",
|
| 10 |
"benchmark_tuned": "No",
|
| 11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-Mixtral-8x22b/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
## Mixtral 8x22B
|
results/Bgym-Mixtral-8x22b/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"agent_name": "Mixtral-8x22B",
|
| 3 |
+
"backend_llm": "Mixtral-8x22B"
|
| 4 |
+
}
|
results/Bgym-Mixtral-8x22b/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 62.4,
|
| 8 |
+
"std_err": 0.5,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Mixtral-8x22b/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 12.6,
|
| 8 |
+
"std_err": 0.9,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Mixtral-8x22b/workarena-l1.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"benchmark": "WorkArena-L1",
|
| 6 |
+
"score": 12.4,
|
| 7 |
+
"std_err": 0.7,
|
| 8 |
+
"benchmark_specific": "No",
|
| 9 |
+
"benchmark_tuned": "No",
|
| 10 |
+
"followed_evaluation_protocol": "Yes",
|
| 11 |
+
"reproducible": "Yes",
|
| 12 |
+
"comments": "NA",
|
| 13 |
+
"original_or_reproduced": "Original",
|
| 14 |
+
"date_time": "2021-01-04 12:06:00"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 18 |
+
"study_id": "study_id",
|
| 19 |
+
"benchmark": "WorkArena-L1",
|
| 20 |
+
"score": 11.4,
|
| 21 |
+
"std_err": 0.7,
|
| 22 |
+
"benchmark_specific": "No",
|
| 23 |
+
"benchmark_tuned": "No",
|
| 24 |
+
"followed_evaluation_protocol": "Yes",
|
| 25 |
+
"reproducible": "Yes",
|
| 26 |
+
"comments": "NA",
|
| 27 |
+
"original_or_reproduced": "Reproduced",
|
| 28 |
+
"date_time": "2021-01-04 12:06:00"
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 32 |
+
"study_id": "study_id",
|
| 33 |
+
"benchmark": "WorkArena-L1",
|
| 34 |
+
"score": 13.4,
|
| 35 |
+
"std_err": 0.7,
|
| 36 |
+
"benchmark_specific": "No",
|
| 37 |
+
"benchmark_tuned": "No",
|
| 38 |
+
"followed_evaluation_protocol": "Yes",
|
| 39 |
+
"reproducible": "Yes",
|
| 40 |
+
"comments": "NA",
|
| 41 |
+
"original_or_reproduced": "Reproduced",
|
| 42 |
+
"date_time": "2021-01-04 12:06:00"
|
| 43 |
+
}
|
| 44 |
+
]
|
results/Bgym-Mixtral-8x22b/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/Bgym-Mixtral-8x22b/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "Bgym-Mixtral-8x22b",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L3",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/GenericAgent-AgentTrek-1.0-32b/README.md
DELETED
|
@@ -1,85 +0,0 @@
|
|
| 1 |
-
### GenericAgent-AgentTrek-1.0-32b
|
| 2 |
-
|
| 3 |
-
This agent is GenericAgent from AgentLab
|
| 4 |
-
|
| 5 |
-
- **Base Model:**
|
| 6 |
-
|
| 7 |
-
- Qwen/Qwen2.5-32B-Instruct
|
| 8 |
-
- **Architecture:**
|
| 9 |
-
|
| 10 |
-
- Type: Causal Language Models
|
| 11 |
-
- Training Stage: Pretraining & Post-training
|
| 12 |
-
- Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias
|
| 13 |
-
- Number of Parameters: 32.5B
|
| 14 |
-
- Number of Parameters (Non-Embedding): 31.0B
|
| 15 |
-
- Number of Layers: 64
|
| 16 |
-
- Number of Attention Heads (GQA): 40 for Q and 8 for KV
|
| 17 |
-
- Input/Output Format:
|
| 18 |
-
|
| 19 |
-
- with the following flags:
|
| 20 |
-
```txt
|
| 21 |
-
flags=GenericPromptFlags(
|
| 22 |
-
obs=ObsFlags(
|
| 23 |
-
use_html=True,
|
| 24 |
-
use_ax_tree=True,
|
| 25 |
-
use_tabs=False,
|
| 26 |
-
use_focused_element=False,
|
| 27 |
-
use_error_logs=True,
|
| 28 |
-
use_history=True,
|
| 29 |
-
use_past_error_logs=False,
|
| 30 |
-
use_action_history=True,
|
| 31 |
-
use_think_history=False,
|
| 32 |
-
use_diff=False,
|
| 33 |
-
html_type='pruned_html',
|
| 34 |
-
use_screenshot=False,
|
| 35 |
-
use_som=False,
|
| 36 |
-
extract_visible_tag=False,
|
| 37 |
-
extract_clickable_tag=False,
|
| 38 |
-
extract_coords='False',
|
| 39 |
-
filter_visible_elements_only=False,
|
| 40 |
-
openai_vision_detail='auto',
|
| 41 |
-
filter_with_bid_only=False,
|
| 42 |
-
filter_som_only=False
|
| 43 |
-
),
|
| 44 |
-
action=ActionFlags(
|
| 45 |
-
action_set=HighLevelActionSetArgs(
|
| 46 |
-
subsets=('miniwob_all',),
|
| 47 |
-
multiaction=False,
|
| 48 |
-
strict=False,
|
| 49 |
-
retry_with_force=True,
|
| 50 |
-
demo_mode='off'
|
| 51 |
-
),
|
| 52 |
-
long_description=False,
|
| 53 |
-
individual_examples=False,
|
| 54 |
-
multi_actions=None,
|
| 55 |
-
is_strict=None
|
| 56 |
-
),
|
| 57 |
-
use_plan=False,
|
| 58 |
-
use_criticise=False,
|
| 59 |
-
use_thinking=True,
|
| 60 |
-
use_memory=True,
|
| 61 |
-
use_concrete_example=True,
|
| 62 |
-
use_abstract_example=True,
|
| 63 |
-
use_hints=False,
|
| 64 |
-
enable_chat=False,
|
| 65 |
-
max_prompt_tokens=40000,
|
| 66 |
-
be_cautious=True,
|
| 67 |
-
extra_instructions=None,
|
| 68 |
-
add_missparsed_messages=True,
|
| 69 |
-
max_trunc_itr=20,
|
| 70 |
-
flag_group=None
|
| 71 |
-
)
|
| 72 |
-
```
|
| 73 |
-
- Training Details
|
| 74 |
-
|
| 75 |
-
- Dataset used: [AgentTrek-6K](https://agenttrek.github.io)
|
| 76 |
-
- Number of training steps: 3 Epochs
|
| 77 |
-
- Paper Link:
|
| 78 |
-
|
| 79 |
-
- https://arxiv.org/abs/2412.09605
|
| 80 |
-
- Code Repository:
|
| 81 |
-
|
| 82 |
-
- https://agenttrek.github.io
|
| 83 |
-
- License:
|
| 84 |
-
|
| 85 |
-
- apache2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-AgentTrek-1.0-32b/miniwob.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-AgentTrek-1.0-32b",
|
| 4 |
-
"study_id": "4c636aa0-ea52-429d-9d7e-301b7bf0ac74",
|
| 5 |
-
"date_time": "2025-01-22 04:27:37",
|
| 6 |
-
"benchmark": "MiniWoB",
|
| 7 |
-
"score": 60.0,
|
| 8 |
-
"std_err": 2.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "Additional details",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-AgentTrek-1.0-32b/webarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-AgentTrek-1.0-32b",
|
| 4 |
-
"study_id": "ac309635-f3fd-417e-ac16-1e0fc943a54f",
|
| 5 |
-
"date_time": "2025-01-25 10:16:41",
|
| 6 |
-
"benchmark": "WebArena",
|
| 7 |
-
"score": 22.4,
|
| 8 |
-
"std_err": 1.5,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "Additional details",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-AgentTrek-1.0-32b/workarena-l1.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-AgentTrek-1.0-32b",
|
| 4 |
-
"study_id": "ed14232c-cd7e-4708-b334-ebaf1f220000",
|
| 5 |
-
"date_time": "2025-01-12 00:37:04",
|
| 6 |
-
"benchmark": "WorkArena-L1",
|
| 7 |
-
"score": 38.29,
|
| 8 |
-
"std_err": 2.70,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "Additional details",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-AgentTrek-1.0-32b/workarena-l2.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-AgentTrek-1.0-32b",
|
| 4 |
-
"study_id": "957fb895-8548-46f4-92f0-5de6be7ceb61",
|
| 5 |
-
"date_time": "2025-01-12 09:39:21",
|
| 6 |
-
"benchmark": "WorkArena-L2",
|
| 7 |
-
"score": 2.98,
|
| 8 |
-
"std_err": 1.10,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "Additional details",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-AgentTrek-1.0-32b/workarena-l3.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-AgentTrek-1.0-32b",
|
| 4 |
-
"study_id": "a951b33f-d118-4cf4-a2ef-cc2ef204eeb0",
|
| 5 |
-
"date_time": "2025-01-13 12:11:45",
|
| 6 |
-
"benchmark": "WorkArena-L3",
|
| 7 |
-
"score": 0.0,
|
| 8 |
-
"std_err": 0.0,
|
| 9 |
-
"benchmark_specific": "No",
|
| 10 |
-
"benchmark_tuned": "No",
|
| 11 |
-
"followed_evaluation_protocol": "Yes",
|
| 12 |
-
"reproducible": "Yes",
|
| 13 |
-
"comments": "Additional details",
|
| 14 |
-
"original_or_reproduced": "Original"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/README.md
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
### GenericAgent-Claude-3.5-Sonnet
|
| 2 |
-
|
| 3 |
-
This agent is [GenericAgent](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/generic_agent.py) from [AgentLab](https://github.com/ServiceNow/AgentLab)
|
| 4 |
-
|
| 5 |
-
It uses Claude-3.5-sonnet as a backend, with the following [flags](https://github.com/ServiceNow/AgentLab/blob/main/src/agentlab/agents/generic_agent/tmlr_config.py):
|
| 6 |
-
```python
|
| 7 |
-
BASE_FLAGS = GenericPromptFlags(
|
| 8 |
-
obs=dp.ObsFlags(
|
| 9 |
-
use_html=False,
|
| 10 |
-
use_ax_tree=True,
|
| 11 |
-
use_focused_element=True,
|
| 12 |
-
use_error_logs=True,
|
| 13 |
-
use_history=True,
|
| 14 |
-
use_past_error_logs=False,
|
| 15 |
-
use_action_history=True,
|
| 16 |
-
use_think_history=True, # gpt-4o config except for this line
|
| 17 |
-
use_diff=False,
|
| 18 |
-
html_type="pruned_html",
|
| 19 |
-
use_screenshot=False,
|
| 20 |
-
use_som=False,
|
| 21 |
-
extract_visible_tag=True,
|
| 22 |
-
extract_clickable_tag=True,
|
| 23 |
-
extract_coords="False",
|
| 24 |
-
filter_visible_elements_only=False,
|
| 25 |
-
),
|
| 26 |
-
action=dp.ActionFlags(
|
| 27 |
-
multi_actions=False,
|
| 28 |
-
action_set="bid",
|
| 29 |
-
long_description=False,
|
| 30 |
-
individual_examples=False,
|
| 31 |
-
),
|
| 32 |
-
use_plan=False,
|
| 33 |
-
use_criticise=False,
|
| 34 |
-
use_thinking=True,
|
| 35 |
-
use_memory=False,
|
| 36 |
-
use_concrete_example=True,
|
| 37 |
-
use_abstract_example=True,
|
| 38 |
-
use_hints=True,
|
| 39 |
-
enable_chat=False,
|
| 40 |
-
max_prompt_tokens=40_000,
|
| 41 |
-
be_cautious=True,
|
| 42 |
-
extra_instructions=None,
|
| 43 |
-
)
|
| 44 |
-
```
|
| 45 |
-
|
| 46 |
-
Note: Agents don't use vision except for VisualWebArena, where the vision flag is turned on (and the LLM supports it).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/assistantbench.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
-
"study_id": "d93a2398-2b70-41ce-b989-364fed988d73",
|
| 5 |
-
"benchmark": "AssistantBench",
|
| 6 |
-
"score": 5.2,
|
| 7 |
-
"std_err": 1.5,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "Intersection of finished tasks across agents.",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2024-11-28 19:34:58"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/miniwob.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
-
"study_id": "2024-10-25_06-08-16",
|
| 5 |
-
"benchmark": "MiniWoB",
|
| 6 |
-
"score": 69.8,
|
| 7 |
-
"std_err": 1.8,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2021-01-01 12:00:00"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/visualwebarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
-
"study_id": "22f0611d-aeea-4ee9-a533-b45442b5e080",
|
| 5 |
-
"benchmark": "VisualWebArena",
|
| 6 |
-
"score": 21.0,
|
| 7 |
-
"std_err": 1.3,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2024-12-02 09:11:35"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/webarena.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
-
"study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
|
| 5 |
-
"benchmark": "WebArena",
|
| 6 |
-
"score": 36.2,
|
| 7 |
-
"std_err": 1.7,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2024-11-29 22:37:46"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/weblinx.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
-
"study_id": "b9451759-4f0e-492c-a3c8-fa5109d2d9b1",
|
| 5 |
-
"benchmark": "WebLINX",
|
| 6 |
-
"score": 13.7,
|
| 7 |
-
"std_err": 0.6,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2024-11-07 21:42:30"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results/GenericAgent-Claude-3.5-Sonnet/workarena-l1.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"agent_name": "GenericAgent-Claude-3.5-Sonnet",
|
| 4 |
-
"study_id": "2024-10-23_14-17-40",
|
| 5 |
-
"benchmark": "WorkArena-L1",
|
| 6 |
-
"score": 56.4,
|
| 7 |
-
"std_err": 2.7,
|
| 8 |
-
"benchmark_specific": "No",
|
| 9 |
-
"benchmark_tuned": "No",
|
| 10 |
-
"followed_evaluation_protocol": "Yes",
|
| 11 |
-
"reproducible": "Yes",
|
| 12 |
-
"comments": "NA",
|
| 13 |
-
"original_or_reproduced": "Original",
|
| 14 |
-
"date_time": "2021-01-01 12:00:00"
|
| 15 |
-
}
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|