Spaces:
Runtime error
Runtime error
Commit
·
d3db3e5
1
Parent(s):
d0e8be9
remove ruff cache
Browse files
- src/gen/data/arena-hard-v0.1/model_answer/external/gigachat_lite.jsonl +0 -0
- src/gen/data/arena-hard-v0.1/model_answer/external/private/var/folders/ws/s9058_gn5cs181gs2_54lcvc0000gn/T/gradio/4a99fae57971a5f7e281df57ab8739fd979a9345/16.o1.csv +0 -11
- src/gen/data/arena-hard-v0.1/model_answer/internal/gpt-3.5-turbo-0125.jsonl +0 -0
- src/gen/data/arena-hard-v0.1/model_judgement/gpt-4-1106-preview/gigachat_lite.jsonl +0 -0
- src/gen/data/arena-hard-v0.1/model_judgement/gpt-4-1106-preview/gigachat_pro.jsonl +0 -0
- src/leaderboard/build_leaderboard.py +12 -10
src/gen/data/arena-hard-v0.1/model_answer/external/gigachat_lite.jsonl
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/gen/data/arena-hard-v0.1/model_answer/external/private/var/folders/ws/s9058_gn5cs181gs2_54lcvc0000gn/T/gradio/4a99fae57971a5f7e281df57ab8739fd979a9345/16.o1.csv
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
Col1.Col2.Col3.Col4.Col5.Col6.Col7.Col8.Col9.Col10
|
| 2 |
-
1.2.5.6.2.6.3.7.8.8
|
| 3 |
-
10.10.10.7.8.3.8.9.4.8
|
| 4 |
-
5.9.2.10.7.7.4.9.2.3
|
| 5 |
-
4.8.2.9.8.7.6.6.9.4
|
| 6 |
-
1.8.7.3.1.6.7.7.6.1
|
| 7 |
-
9.9.6.2.1.5.5.2.5.5
|
| 8 |
-
8.2.10.5.10.10.7.6.3.6
|
| 9 |
-
6.1.8.3.3.4.7.7.8.5
|
| 10 |
-
7.1.3.3.2.4.5.9.5.6
|
| 11 |
-
4.1.4.4.6.1.2.6.9.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/gen/data/arena-hard-v0.1/model_answer/internal/gpt-3.5-turbo-0125.jsonl
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/gen/data/arena-hard-v0.1/model_judgement/gpt-4-1106-preview/gigachat_lite.jsonl
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/gen/data/arena-hard-v0.1/model_judgement/gpt-4-1106-preview/gigachat_pro.jsonl
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/leaderboard/build_leaderboard.py
CHANGED
|
@@ -19,7 +19,7 @@ def time_diff_wrapper(func):
|
|
| 19 |
result = func(*args, **kwargs)
|
| 20 |
end_time = time.time()
|
| 21 |
diff = end_time - start_time
|
| 22 |
-
logging.info(
|
| 23 |
return result
|
| 24 |
|
| 25 |
return wrapper
|
|
@@ -45,33 +45,34 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
|
|
| 45 |
return
|
| 46 |
except Exception as e:
|
| 47 |
wait_time = backoff_factor**attempt
|
| 48 |
-
logging.error(
|
| 49 |
time.sleep(wait_time)
|
| 50 |
attempt += 1
|
| 51 |
-
logging.error(
|
| 52 |
|
| 53 |
|
| 54 |
def build_leadearboard_df():
|
| 55 |
"""Initializes the application space, loading only necessary data."""
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
# download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
|
| 59 |
download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
|
| 60 |
# print(subprocess.Popen('ls src'))
|
|
|
|
|
|
|
| 61 |
subprocess.run(
|
| 62 |
[
|
| 63 |
"rsync",
|
| 64 |
-
"-
|
| 65 |
"--ignore-existing",
|
| 66 |
-
f"{EVAL_RESULTS_PATH}/
|
| 67 |
-
"src/gen/data/arena-hard-v0.1/model_answer/",
|
| 68 |
],
|
| 69 |
check=False,
|
| 70 |
)
|
| 71 |
subprocess.run(
|
| 72 |
[
|
| 73 |
"rsync",
|
| 74 |
-
"-
|
| 75 |
"--ignore-existing",
|
| 76 |
f"{EVAL_RESULTS_PATH}/model_judgment/*",
|
| 77 |
"src/gen/data/arena-hard-v0.1/model_judgement/",
|
|
@@ -80,5 +81,6 @@ def build_leadearboard_df():
|
|
| 80 |
)
|
| 81 |
|
| 82 |
# Retrieve the leaderboard DataFrame
|
| 83 |
-
|
|
|
|
| 84 |
return leaderboard_df.copy()
|
|
|
|
| 19 |
result = func(*args, **kwargs)
|
| 20 |
end_time = time.time()
|
| 21 |
diff = end_time - start_time
|
| 22 |
+
logging.info("Time taken for %s: %s seconds", func.__name__, diff)
|
| 23 |
return result
|
| 24 |
|
| 25 |
return wrapper
|
|
|
|
| 45 |
return
|
| 46 |
except Exception as e:
|
| 47 |
wait_time = backoff_factor**attempt
|
| 48 |
+
logging.error("Error downloading %s: %s, retrying in %ss", repo_id, e, wait_time)
|
| 49 |
time.sleep(wait_time)
|
| 50 |
attempt += 1
|
| 51 |
+
logging.error("Failed to download %s after %s attempts", repo_id, max_attempts)
|
| 52 |
|
| 53 |
|
| 54 |
def build_leadearboard_df():
|
| 55 |
"""Initializes the application space, loading only necessary data."""
|
| 56 |
|
| 57 |
+
# download answers of different models that we trust
|
|
|
|
| 58 |
download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
|
| 59 |
# print(subprocess.Popen('ls src'))
|
| 60 |
+
|
| 61 |
+
# copy the trusted model answers into the local model_answer directory
|
| 62 |
subprocess.run(
|
| 63 |
[
|
| 64 |
"rsync",
|
| 65 |
+
"-azP",
|
| 66 |
"--ignore-existing",
|
| 67 |
+
f"{EVAL_RESULTS_PATH}/internal/*.jsonl",
|
| 68 |
+
"src/gen/data/arena-hard-v0.1/model_answer/interla/*",
|
| 69 |
],
|
| 70 |
check=False,
|
| 71 |
)
|
| 72 |
subprocess.run(
|
| 73 |
[
|
| 74 |
"rsync",
|
| 75 |
+
"-azP",
|
| 76 |
"--ignore-existing",
|
| 77 |
f"{EVAL_RESULTS_PATH}/model_judgment/*",
|
| 78 |
"src/gen/data/arena-hard-v0.1/model_judgement/",
|
|
|
|
| 81 |
)
|
| 82 |
|
| 83 |
# Retrieve the leaderboard DataFrame
|
| 84 |
+
with open("eval-results/evals/upd.json", "r", encoding="utf-8") as eval_file:
|
| 85 |
+
leaderboard_df = pd.DataFrame.from_records(json.load(eval_file))
|
| 86 |
return leaderboard_df.copy()
|