Commit 15c92e9
Alex committed
1 Parent(s): b4d9db9
zalupa
Browse files
- app.py +55 -1
- src/populate.py +35 -0
- src/submission/submit.py +130 -1
app.py
CHANGED
@@ -28,7 +28,7 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, add_manual_results
 
 
 def restart_space():
@@ -190,6 +190,60 @@ with demo:
         submission_result,
     )
 
+    # ----------------------------------------------------
+    # Manual metrics submission form
+    # ----------------------------------------------------
+    with gr.Markdown("## 📝 Submit metrics manually (advanced)"):
+        pass
+
+    with gr.Row():
+        with gr.Column():
+            model_name_metrics = gr.Textbox(label="Model name", placeholder="org/model")
+            revision_metrics = gr.Textbox(label="Revision commit", placeholder="main", value="main")
+            bleu_input = gr.Number(label="BLEU", value=0.5)
+            pass1_input = gr.Number(label="Pass@1", value=0.5, minimum=0.0, maximum=1.0)
+            pass5_input = gr.Number(label="Pass@5", value=0.5, minimum=0.0, maximum=1.0)
+            pass10_input = gr.Number(label="Pass@10", value=0.5, minimum=0.0, maximum=1.0)
+
+        with gr.Column():
+            # Subjective metrics sliders (0-5)
+            readability_slider = gr.Slider(0, 5, step=1, value=3, label="Readability")
+            relevance_slider = gr.Slider(0, 5, step=1, value=3, label="Relevance")
+            explanation_slider = gr.Slider(0, 5, step=1, value=3, label="Explanation clarity")
+            problem_slider = gr.Slider(0, 5, step=1, value=3, label="Problem identification")
+            actionability_slider = gr.Slider(0, 5, step=1, value=3, label="Actionability")
+            completeness_slider = gr.Slider(0, 5, step=1, value=3, label="Completeness")
+            specificity_slider = gr.Slider(0, 5, step=1, value=3, label="Specificity")
+            contextual_slider = gr.Slider(0, 5, step=1, value=3, label="Contextual adequacy")
+            consistency_slider = gr.Slider(0, 5, step=1, value=3, label="Consistency")
+            brevity_slider = gr.Slider(0, 5, step=1, value=3, label="Brevity")
+
+    submit_metrics_button = gr.Button("Submit Metrics")
+    metrics_submission_result = gr.Markdown()
+
+    submit_metrics_button.click(
+        add_manual_results,
+        [
+            model_name_metrics,
+            revision_metrics,
+            bleu_input,
+            readability_slider,
+            relevance_slider,
+            explanation_slider,
+            problem_slider,
+            actionability_slider,
+            completeness_slider,
+            specificity_slider,
+            contextual_slider,
+            consistency_slider,
+            brevity_slider,
+            pass1_input,
+            pass5_input,
+            pass10_input,
+        ],
+        metrics_submission_result,
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
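Note: the .click() wiring above passes the sixteen components positionally, so their order must match the parameter order of add_manual_results as defined in src/submission/submit.py further down in this commit. A minimal sketch of an equivalent direct call, with purely illustrative values; running it outside the Space would still attempt a real upload to RESULTS_REPO and therefore needs valid credentials:

# Sketch only: calls the handler directly with made-up scores, in the
# same positional order as the Gradio component list above.
result_md = add_manual_results(
    "org/model",       # model (illustrative)
    "main",            # revision
    0.42,              # bleu
    4, 4, 3,           # readability, relevance, explanation_clarity
    3, 4, 3,           # problem_identification, actionability, completeness
    3, 4, 4, 2,        # specificity, contextual_adequacy, consistency, brevity
    0.30, 0.45, 0.55,  # pass_at_1, pass_at_5, pass_at_10
)
print(result_md)  # styled success or error markup returned by the handler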
src/populate.py
CHANGED
@@ -14,6 +14,35 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+
+    # ------------------------------------------------------------------
+    # Fallback: if no evaluation results are found we populate the
+    # leaderboard with a single example model. This guarantees that a
+    # freshly deployed Space shows a non-empty leaderboard and it serves
+    # as a template for the expected columns/values.
+    # ------------------------------------------------------------------
+    if df.empty:
+        example_row = {}
+
+        # Populate benchmark metrics with the default value 0.5
+        for metric in benchmark_cols:
+            example_row[metric] = 0.5
+
+        # Minimal metadata so that the row displays nicely
+        example_row[AutoEvalColumn.model.name] = make_clickable_model("example/model")
+        example_row[AutoEvalColumn.average.name] = 0.5
+        example_row[AutoEvalColumn.model_type_symbol.name] = "🟢"
+        example_row[AutoEvalColumn.model_type.name] = "pretrained"
+        example_row[AutoEvalColumn.precision.name] = "float16"
+        example_row[AutoEvalColumn.weight_type.name] = "Original"
+        example_row[AutoEvalColumn.still_on_hub.name] = True
+        example_row[AutoEvalColumn.architecture.name] = "Transformer"
+        example_row[AutoEvalColumn.revision.name] = "main"
+        example_row[AutoEvalColumn.license.name] = "apache-2.0"
+
+        # Any missing columns will be created later in the function
+        df = pd.DataFrame([example_row])
+
     # Sort primarily by LLM exact-match Pass@1 metric; if not present, fall back to average
     preferred_cols = []
     if hasattr(AutoEvalColumn, "pass_at_1"):
@@ -24,6 +53,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         if col in df.columns:
            df = df.sort_values(by=[col], ascending=False)
            break
+
+    # Ensure all expected columns exist, add missing ones with NaN so selection does not fail
+    for expected in cols:
+        if expected not in df.columns:
+            df[expected] = pd.NA
+
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
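Two defensive additions here: the fallback example row keeps a freshly deployed Space from rendering an empty leaderboard, and the column back-fill keeps the later df = df[cols] selection from raising a KeyError when a result file lacks a metric. A small self-contained sketch of the back-fill pattern, with column names made up for illustration:

import pandas as pd

cols = ["model", "bleu", "pass_at_1"]                          # expected columns (illustrative)
df = pd.DataFrame([{"model": "example/model", "bleu": 0.5}])   # "pass_at_1" is missing

for expected in cols:
    if expected not in df.columns:
        df[expected] = pd.NA      # back-fill so the selection below cannot fail

df = df[cols].round(decimals=2)   # missing metric now shows up as <NA>
print(df)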
src/submission/submit.py
CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -117,3 +117,132 @@ def add_new_eval(
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
     )
+
+# --------------------------------------------------------
+# Manual metrics submission (bypass evaluation queue)
+# --------------------------------------------------------
+
+ALL_SUBJECTIVE_FIELDS = [
+    "readability",
+    "relevance",
+    "explanation_clarity",
+    "problem_identification",
+    "actionability",
+    "completeness",
+    "specificity",
+    "contextual_adequacy",
+    "consistency",
+    "brevity",
+]
+
+def _compute_multimetric(payload: dict) -> float:
+    """Average of the 10 subjective metrics."""
+    total = sum(float(payload[f]) for f in ALL_SUBJECTIVE_FIELDS)
+    return total / len(ALL_SUBJECTIVE_FIELDS)
+
+def add_manual_results(
+    model: str,
+    revision: str,
+    bleu: float,
+    readability: int,
+    relevance: int,
+    explanation_clarity: int,
+    problem_identification: int,
+    actionability: int,
+    completeness: int,
+    specificity: int,
+    contextual_adequacy: int,
+    consistency: int,
+    brevity: int,
+    pass_at_1: float,
+    pass_at_5: float,
+    pass_at_10: float,
+):
+    """Directly submit evaluation metrics for a model and push them to the results dataset."""
+
+    # Basic validation
+    if model == "":
+        return styled_error("Please specify a model name.")
+
+    if revision == "":
+        revision = "main"
+
+    if pass_at_5 < pass_at_1:
+        return styled_error("pass@5 must be greater or equal to pass@1")
+    if pass_at_10 < pass_at_5:
+        return styled_error("pass@10 must be greater or equal to pass@5")
+
+    # Prepare dictionary in the same format used by read_evals.py
+    payload_dict = {
+        "model": model,
+        "revision": revision,
+        "bleu": bleu,
+        "readability": readability,
+        "relevance": relevance,
+        "explanation_clarity": explanation_clarity,
+        "problem_identification": problem_identification,
+        "actionability": actionability,
+        "completeness": completeness,
+        "specificity": specificity,
+        "contextual_adequacy": contextual_adequacy,
+        "consistency": consistency,
+        "brevity": brevity,
+        "pass_at_1": pass_at_1,
+        "pass_at_5": pass_at_5,
+        "pass_at_10": pass_at_10,
+    }
+
+    multimetric = _compute_multimetric(payload_dict)
+
+    # Compose final results file (same structure as api_submit_results)
+    result_json = {
+        "config": {
+            "model_dtype": "unknown",
+            "model_name": model,
+            "model_sha": revision,
+        },
+        "results": {
+            "bleu": {"score": bleu},
+            "multimetric": {"score": multimetric},
+            "pass_at_1": {"score": pass_at_1},
+            "pass_at_5": {"score": pass_at_5},
+            "pass_at_10": {"score": pass_at_10},
+        },
+    }
+
+    # Add subjective metrics
+    for field in ALL_SUBJECTIVE_FIELDS:
+        result_json["results"][field] = {"score": payload_dict[field]}
+
+    # Write file locally then upload
+    try:
+        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+    except Exception:
+        pass
+
+    from datetime import datetime, timezone
+    import uuid
+
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    unique_id = uuid.uuid4().hex[:8]
+    filename = f"results_{model.replace('/', '_')}_{ts}_{unique_id}.json"
+    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
+
+    try:
+        with open(local_path, "w") as fp:
+            json.dump(result_json, fp)
+
+        API.upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=filename,
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add manual results for {model}",
+        )
+    except Exception as e:
+        return styled_error(f"Failed to upload results: {e}")
+    finally:
+        if os.path.exists(local_path):
+            os.remove(local_path)
+
+    return styled_message("Metrics successfully submitted! The leaderboard will refresh shortly.")
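For reference, _compute_multimetric is a plain arithmetic mean of the ten subjective scores, and the uploaded file follows the config/results layout referenced in the comments above (read_evals.py / api_submit_results). A standalone sketch with made-up scores shows the shape of the JSON that gets pushed to RESULTS_REPO:

import json

# Made-up subjective scores on the 0-5 scale used by the sliders.
subjective = {
    "readability": 4, "relevance": 4, "explanation_clarity": 3,
    "problem_identification": 3, "actionability": 4, "completeness": 3,
    "specificity": 3, "contextual_adequacy": 4, "consistency": 4, "brevity": 2,
}
multimetric = sum(subjective.values()) / len(subjective)  # arithmetic mean = 3.4

result_json = {
    "config": {"model_dtype": "unknown", "model_name": "org/model", "model_sha": "main"},
    "results": {
        "bleu": {"score": 0.42},
        "multimetric": {"score": multimetric},
        "pass_at_1": {"score": 0.30},
        "pass_at_5": {"score": 0.45},
        "pass_at_10": {"score": 0.55},
        **{name: {"score": score} for name, score in subjective.items()},
    },
}
print(json.dumps(result_json, indent=2))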