Commit c50d20c: add columns
1 Parent(s): 72a9f71
Files changed:
- app.py +6 -5
- src/display/css_html_js.py +17 -0
- src/display/utils.py +17 -2
- src/leaderboard/read_evals.py +53 -2
- src/populate.py +5 -3
app.py
CHANGED
@@ -60,16 +60,17 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    visible_columns = [c for c in fields(AutoEvalColumn) if not c.hidden]
     return Leaderboard(
         value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
+        datatype=[c.type for c in visible_columns],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            default_selection=[c.name for c in visible_columns if c.displayed_by_default],
+            cant_deselect=[c.name for c in visible_columns if c.never_hidden],
             label="Select Columns to Display:",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden and c.name in dataframe.columns],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),

@@ -201,4 +202,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
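Net effect in app.py: the datatype, default_selection, and cant_deselect lists are now derived from visible_columns, so hidden columns never reach the column picker, and hide_columns is intersected with dataframe.columns, so the Leaderboard component is never asked to hide a column that the DataFrame no longer carries. A minimal sketch of that guard, assuming only pandas (column names here are illustrative, not from the repo):

import pandas as pd

# Hypothetical: "Hidden A" survived column selection, "Hidden B" was dropped upstream.
df = pd.DataFrame({"Model": ["m1"], "Hidden A": [1.0]})
hidden_names = ["Hidden A", "Hidden B"]

# Only request hiding for columns that actually exist in the frame.
safe_hide = [name for name in hidden_names if name in df.columns]
print(safe_hide)  # ['Hidden A']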
src/display/css_html_js.py
CHANGED
@@ -103,3 +103,20 @@ get_window_url_params = """
     return url_params;
 }
 """
+
+# src/display/css_html_js.py
+
+custom_css = """
+/* ... (keep the earlier rules unchanged) ... */
+
+#box-filter > .form{
+    border: 0
+}
+
+/* --- new rule: allow line breaks in table headers --- */
+th {
+    white-space: pre-wrap !important;
+    text-align: center !important;
+    vertical-align: bottom !important;
+}
+"""
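This CSS pairs with the new column headers in src/display/utils.py, which embed literal newlines; white-space: pre-wrap is what lets the th cells render those as real line breaks instead of collapsing them to spaces. Note also that if the stock template already defines custom_css earlier in this file, this appended assignment rebinds the name, which is presumably why the placeholder comment asks that the earlier rules be kept (they would need to be carried into this new string). A tiny illustration of the header strings involved:

# Hypothetical check: the display names carry a real newline character.
header = "Accent\n(SER|WER|SW-WER)"
print(header)  # prints on two lines; in HTML, pre-wrap preserves the same break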
src/display/utils.py
CHANGED
@@ -20,6 +20,19 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+# Aggregated metrics displayed alongside the global average
+ADDITIONAL_SCORE_SPECS = [
+    ("accent_oriented", "Accent\n(SER|WER|SW-WER)"),
+    ("acoustic_env_oriented", "Acoustic\n(SER|WER|SW-WER)"),
+    ("age_oriented", "Age\n(SER|WER|SW-WER)"),
+    ("formality_oriented", "Formality\n(SER|WER|SW-WER)"),
+    ("gender_oriented", "Gender\n(SER|WER|SW-WER)"),
+    ("num_of_speaker_oriented", "#Speakers\n(SER|WER|SW-WER)"),
+    ("spontaneous_oriented", "Spontaneous\n(SER|WER|SW-WER)"),
+]
+ADDITIONAL_SCORE_FIELDS = [name for name, _ in ADDITIONAL_SCORE_SPECS]
+ADDITIONAL_SCORE_SOURCE_KEYS = {name: [name.replace("_", "-"), name] for name in ADDITIONAL_SCORE_FIELDS}
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init

@@ -27,8 +40,11 @@ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent(
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for field_name, display_name in ADDITIONAL_SCORE_SPECS:
+    auto_eval_column_dict.append([field_name, ColumnContent, ColumnContent(display_name, "number", True)])
+# Hide task-specific metrics from the selector; only expose the aggregate.
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False, True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])

@@ -107,4 +123,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
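Two details worth spelling out: in the stock template, ColumnContent's positional parameters after the type string are displayed_by_default and then hidden, so ColumnContent(task.value.col_name, "number", False, True) both deselects and hides each per-task column; and ADDITIONAL_SCORE_SOURCE_KEYS pairs each field with its dashed and underscored spellings so result files may use either. A standalone recreation of that mapping:

# Recreates the dict comprehension above outside the module.
names = ["accent_oriented", "num_of_speaker_oriented"]
source_keys = {name: [name.replace("_", "-"), name] for name in names}
print(source_keys["accent_oriented"])          # ['accent-oriented', 'accent_oriented']
print(source_keys["num_of_speaker_oriented"])  # ['num-of-speaker-oriented', 'num_of_speaker_oriented']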
src/leaderboard/read_evals.py
CHANGED
@@ -2,16 +2,60 @@ import glob
 import json
 import math
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import (
+    ADDITIONAL_SCORE_FIELDS,
+    ADDITIONAL_SCORE_SOURCE_KEYS,
+    AutoEvalColumn,
+    ModelType,
+    Precision,
+    Tasks,
+    WeightType,
+)
 from src.submission.check_validity import is_model_on_hub
 
 
+def _score_to_percentage(score):
+    """Convert ratio metrics to percentage while leaving already-percentage scores untouched."""
+    if isinstance(score, (int, float)):
+        return score * 100 if 0 <= score <= 1 else score
+    return None
+
+
+def _extract_numeric_metric(metric_container):
+    """Grab the first numeric value from a metric container."""
+    if isinstance(metric_container, (int, float)):
+        return metric_container
+    if isinstance(metric_container, dict):
+        for value in metric_container.values():
+            if isinstance(value, (int, float)):
+                return value
+    return None
+
+
+def _extract_additional_scores(results):
+    """Extract additional aggregate scores (accent, gender, etc.) from the raw results."""
+    scores = {getattr(AutoEvalColumn, field_name).name: None for field_name in ADDITIONAL_SCORE_FIELDS}
+    normalized_results = {k.lower(): v for k, v in results.items()}
+
+    for field_name, candidate_keys in ADDITIONAL_SCORE_SOURCE_KEYS.items():
+        metric_value = None
+        for candidate_key in candidate_keys:
+            normalized_key = candidate_key.lower()
+            if normalized_key in normalized_results:
+                metric_value = _extract_numeric_metric(normalized_results[normalized_key])
+                break
+        if metric_value is not None:
+            scores[getattr(AutoEvalColumn, field_name).name] = _score_to_percentage(metric_value)
+
+    return scores
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.

@@ -22,6 +66,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
+    aggregated_scores: dict = field(default_factory=dict)
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter

@@ -78,6 +123,7 @@
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+        aggregated_scores = _extract_additional_scores(data.get("results", {}))
 
         return self(
             eval_name=result_key,

@@ -85,6 +131,7 @@
             org=org,
             model=model,
             results=results,
+            aggregated_scores=aggregated_scores,
             precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,

@@ -126,6 +173,7 @@
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
+        data_dict.update(self.aggregated_scores)
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
@@ -182,6 +230,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            eval_results[eval_name].aggregated_scores.update(
+                {k: v for k, v in eval_result.aggregated_scores.items() if v is not None}
+            )
         else:
             eval_results[eval_name] = eval_result
 
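The helper chain works in three steps: find a matching key (dashed or underscored, case-insensitive), pull the first numeric value out of whatever container holds it, then normalize ratios to percentages. One caveat of the 0 <= score <= 1 heuristic: a genuine percentage below 1 would also be multiplied by 100. A sketch of the expected behavior on a hypothetical payload (real result files may name and nest metrics differently):

# Hypothetical input; keys and nesting are illustrative only.
raw_results = {
    "accent-oriented": {"ser": 0.12},  # ratio inside a dict: first numeric, then *100 -> 12.0
    "gender_oriented": 37.5,           # bare number above 1: kept as-is -> 37.5
}
# _extract_additional_scores(raw_results) would map the accent column to 12.0,
# the gender column to 37.5, and leave every other aggregate column as None.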
src/populate.py
CHANGED
@@ -14,11 +14,13 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    benchmark_cols_available = [col for col in benchmark_cols if col in df.columns]
+    if benchmark_cols_available:
+        df = df[has_no_nan_values(df, benchmark_cols_available)]
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
     return df
 
 
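The reordering here is load-bearing: in the stock template COLS excludes hidden columns, and the per-task benchmark columns are now hidden, so df[cols] drops them; filtering on benchmark_cols after that selection would hit missing columns. Running the NaN filter first, restricted to the benchmark columns actually present, sidesteps that. A minimal standalone sketch of the guarded filter (column names are illustrative, and the notna check mirrors what has_no_nan_values does in the template):

import pandas as pd

df = pd.DataFrame({"Average": [50.0, 60.0], "task_a": [1.0, None]})
benchmark_cols = ["task_a", "task_b"]  # "task_b" was never produced

available = [c for c in benchmark_cols if c in df.columns]  # ["task_a"]
if available:
    df = df[df[available].notna().all(axis=1)]  # keeps only rows with all scores present
print(df)  # the row with a NaN task_a score is gone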