Spaces:
Runtime error
Runtime error
Commit ·
c7c971e
1
Parent(s): e350426
Strange system with file for persistent check
Browse files- app.py +46 -15
- src/envs.py +5 -0
- src/gen/show_result.py +3 -3
- src/leaderboard/build_leaderboard.py +2 -2
app.py
CHANGED
|
@@ -16,7 +16,16 @@ from src.display.utils import (
|
|
| 16 |
AutoEvalColumn,
|
| 17 |
fields,
|
| 18 |
)
|
| 19 |
-
from src.envs import API, H4_TOKEN, HF_HOME, REPO_ID, RESET_JUDGEMENT_ENV
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
from src.leaderboard.build_leaderboard import build_leadearboard_df, download_openbench
|
| 21 |
|
| 22 |
os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
|
|
@@ -30,10 +39,6 @@ enable_space_ci()
|
|
| 30 |
download_openbench()
|
| 31 |
|
| 32 |
|
| 33 |
-
def restart_space():
|
| 34 |
-
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
| 35 |
-
|
| 36 |
-
|
| 37 |
def build_demo():
|
| 38 |
demo = gr.Blocks(title="Chatbot Arena Leaderboard", css=custom_css)
|
| 39 |
leaderboard_df = build_leadearboard_df()
|
|
@@ -80,6 +85,14 @@ def build_demo():
|
|
| 80 |
repo_id="Vikhrmodels/openbench-eval",
|
| 81 |
repo_type="dataset",
|
| 82 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
os.environ[RESET_JUDGEMENT_ENV] = "1"
|
| 84 |
return file.name
|
| 85 |
|
|
@@ -98,24 +111,42 @@ def build_demo():
|
|
| 98 |
|
| 99 |
|
| 100 |
def update_board():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
|
| 102 |
logging.info("Updating the judgement: %s", need_reset)
|
| 103 |
if need_reset != "1":
|
| 104 |
return
|
| 105 |
os.environ[RESET_JUDGEMENT_ENV] = "0"
|
| 106 |
-
|
| 107 |
-
# gen_judgement_file = os.path.join(HF_HOME, "src/gen/gen_judgement.py")
|
| 108 |
-
# subprocess.run(["python3", gen_judgement_file], check=True)
|
| 109 |
-
|
| 110 |
-
show_result_file = os.path.join(HF_HOME, "src/gen/show_result.py")
|
| 111 |
-
subprocess.run(["python3", show_result_file, "--output"], check=True)
|
| 112 |
-
|
| 113 |
-
# update the gr item with leaderboard
|
| 114 |
-
# TODO
|
| 115 |
|
| 116 |
|
| 117 |
if __name__ == "__main__":
|
| 118 |
-
os.environ[RESET_JUDGEMENT_ENV] = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
scheduler = BackgroundScheduler()
|
| 121 |
scheduler.add_job(update_board, "interval", minutes=10)
|
|
|
|
| 16 |
AutoEvalColumn,
|
| 17 |
fields,
|
| 18 |
)
|
| 19 |
+
from src.envs import (
|
| 20 |
+
API,
|
| 21 |
+
H4_TOKEN,
|
| 22 |
+
HF_HOME,
|
| 23 |
+
METAINFO_DATASET,
|
| 24 |
+
PERSISTENT_FILE_CHECK,
|
| 25 |
+
PERSISTENT_FILE_CHECK_PATH,
|
| 26 |
+
REPO_ID,
|
| 27 |
+
RESET_JUDGEMENT_ENV,
|
| 28 |
+
)
|
| 29 |
from src.leaderboard.build_leaderboard import build_leadearboard_df, download_openbench
|
| 30 |
|
| 31 |
os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
|
|
|
|
| 39 |
download_openbench()
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def build_demo():
|
| 43 |
demo = gr.Blocks(title="Chatbot Arena Leaderboard", css=custom_css)
|
| 44 |
leaderboard_df = build_leadearboard_df()
|
|
|
|
| 85 |
repo_id="Vikhrmodels/openbench-eval",
|
| 86 |
repo_type="dataset",
|
| 87 |
)
|
| 88 |
+
with open(PERSISTENT_FILE_CHECK_PATH, "w", encoding="utf-8") as f:
|
| 89 |
+
f.write("1")
|
| 90 |
+
API.upload_file(
|
| 91 |
+
path_or_fileobj=PERSISTENT_FILE_CHECK,
|
| 92 |
+
path_in_repo="",
|
| 93 |
+
repo_id=METAINFO_DATASET,
|
| 94 |
+
repo_type="dataset",
|
| 95 |
+
)
|
| 96 |
os.environ[RESET_JUDGEMENT_ENV] = "1"
|
| 97 |
return file.name
|
| 98 |
|
|
|
|
| 111 |
|
| 112 |
|
| 113 |
def update_board():
|
| 114 |
+
# very shitty solution, where we update board only when needed
|
| 115 |
+
# the state is checked by the file PERSISTENT_FILE_CHECK
|
| 116 |
+
# very bad solution
|
| 117 |
+
# but a fast one to code
|
| 118 |
need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
|
| 119 |
logging.info("Updating the judgement: %s", need_reset)
|
| 120 |
if need_reset != "1":
|
| 121 |
return
|
| 122 |
os.environ[RESET_JUDGEMENT_ENV] = "0"
|
| 123 |
+
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
if __name__ == "__main__":
|
| 127 |
+
os.environ[RESET_JUDGEMENT_ENV] = "0"
|
| 128 |
+
|
| 129 |
+
need_recalc = False
|
| 130 |
+
try:
|
| 131 |
+
with open(PERSISTENT_FILE_CHECK_PATH, "r", encoding="utf-8") as f:
|
| 132 |
+
need_recalc = f.read() == "1"
|
| 133 |
+
with open(PERSISTENT_FILE_CHECK_PATH, "w", encoding="utf-8") as f:
|
| 134 |
+
f.write("0")
|
| 135 |
+
except FileNotFoundError:
|
| 136 |
+
pass
|
| 137 |
+
|
| 138 |
+
if need_recalc:
|
| 139 |
+
API.upload_file(
|
| 140 |
+
path_or_fileobj=PERSISTENT_FILE_CHECK,
|
| 141 |
+
path_in_repo="",
|
| 142 |
+
repo_id=METAINFO_DATASET,
|
| 143 |
+
repo_type="dataset",
|
| 144 |
+
)
|
| 145 |
+
# gen_judgement_file = os.path.join(HF_HOME, "src/gen/gen_judgement.py")
|
| 146 |
+
# subprocess.run(["python3", gen_judgement_file], check=True)
|
| 147 |
+
|
| 148 |
+
show_result_file = os.path.join(HF_HOME, "src/gen/show_result.py")
|
| 149 |
+
subprocess.run(["python3", show_result_file, "--output"], check=True)
|
| 150 |
|
| 151 |
scheduler = BackgroundScheduler()
|
| 152 |
scheduler.add_job(update_board, "interval", minutes=10)
|
src/envs.py
CHANGED
|
@@ -35,6 +35,11 @@ RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"
|
|
| 35 |
|
| 36 |
API = HfApi(token=H4_TOKEN)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
# useless env
|
| 39 |
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "data/eval-queue")
|
| 40 |
PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
|
|
|
|
| 35 |
|
| 36 |
API = HfApi(token=H4_TOKEN)
|
| 37 |
|
| 38 |
+
PERSISTENT_FILE_CHECK = "persistent_file_check"
|
| 39 |
+
PERSISTENT_FILE_CHECK_PATH = f"{DATA_PATH}/{PERSISTENT_FILE_CHECK}"
|
| 40 |
+
|
| 41 |
+
METAINFO_DATASET = "Vikhrmodels/arena-leaderboard-metainfo"
|
| 42 |
+
|
| 43 |
# useless env
|
| 44 |
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "data/eval-queue")
|
| 45 |
PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
|
src/gen/show_result.py
CHANGED
|
@@ -12,7 +12,7 @@ from sklearn.linear_model import LogisticRegression
|
|
| 12 |
from tqdm import tqdm
|
| 13 |
from utils import load_model_answers
|
| 14 |
|
| 15 |
-
from src.envs import HF_TOKEN_PRIVATE
|
| 16 |
|
| 17 |
|
| 18 |
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
|
|
@@ -265,7 +265,7 @@ if __name__ == "__main__":
|
|
| 265 |
huggingface_hub.HfApi().upload_file(
|
| 266 |
path_or_fileobj=json_file_name,
|
| 267 |
path_in_repo="leaderboard.json",
|
| 268 |
-
repo_id="Vikhrmodels/arena-leaderboard-metainfo",
|
| 269 |
repo_type="dataset",
|
| 270 |
token=HF_TOKEN_PRIVATE,
|
| 271 |
)
|
|
@@ -273,7 +273,7 @@ if __name__ == "__main__":
|
|
| 273 |
huggingface_hub.HfApi().upload_file(
|
| 274 |
path_or_fileobj=json_file_name,
|
| 275 |
path_in_repo=f"leaderboard_logs/{json_file_name}",
|
| 276 |
-
repo_id="Vikhrmodels/arena-leaderboard-metainfo",
|
| 277 |
repo_type="dataset",
|
| 278 |
token=HF_TOKEN_PRIVATE,
|
| 279 |
)
|
|
|
|
| 12 |
from tqdm import tqdm
|
| 13 |
from utils import load_model_answers
|
| 14 |
|
| 15 |
+
from src.envs import HF_TOKEN_PRIVATE, METAINFO_DATASET
|
| 16 |
|
| 17 |
|
| 18 |
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
|
|
|
|
| 265 |
huggingface_hub.HfApi().upload_file(
|
| 266 |
path_or_fileobj=json_file_name,
|
| 267 |
path_in_repo="leaderboard.json",
|
| 268 |
+
repo_id=METAINFO_DATASET,
|
| 269 |
repo_type="dataset",
|
| 270 |
token=HF_TOKEN_PRIVATE,
|
| 271 |
)
|
|
|
|
| 273 |
huggingface_hub.HfApi().upload_file(
|
| 274 |
path_or_fileobj=json_file_name,
|
| 275 |
path_in_repo=f"leaderboard_logs/{json_file_name}",
|
| 276 |
+
repo_id=METAINFO_DATASET,
|
| 277 |
repo_type="dataset",
|
| 278 |
token=HF_TOKEN_PRIVATE,
|
| 279 |
)
|
src/leaderboard/build_leaderboard.py
CHANGED
|
@@ -6,7 +6,7 @@ import time
|
|
| 6 |
import pandas as pd
|
| 7 |
from huggingface_hub import snapshot_download
|
| 8 |
|
| 9 |
-
from src.envs import DATA_ARENA_PATH, DATA_PATH, HF_TOKEN_PRIVATE
|
| 10 |
|
| 11 |
# Configure logging
|
| 12 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
@@ -53,7 +53,7 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
|
|
| 53 |
|
| 54 |
def download_openbench():
|
| 55 |
# download prev autogenerated leaderboard files
|
| 56 |
-
download_dataset(
|
| 57 |
|
| 58 |
# download answers of different models that we trust
|
| 59 |
download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
from huggingface_hub import snapshot_download
|
| 8 |
|
| 9 |
+
from src.envs import DATA_ARENA_PATH, DATA_PATH, HF_TOKEN_PRIVATE, METAINFO_DATASET
|
| 10 |
|
| 11 |
# Configure logging
|
| 12 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
|
| 53 |
|
| 54 |
def download_openbench():
|
| 55 |
# download prev autogenerated leaderboard files
|
| 56 |
+
download_dataset(METAINFO_DATASET, DATA_PATH)
|
| 57 |
|
| 58 |
# download answers of different models that we trust
|
| 59 |
download_dataset("Vikhrmodels/openbench-eval", DATA_ARENA_PATH)
|