pourbahman committed
Commit c50d20c · 1 Parent(s): 72a9f71

add columns

app.py CHANGED
@@ -60,16 +60,17 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    visible_columns = [c for c in fields(AutoEvalColumn) if not c.hidden]
     return Leaderboard(
         value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
+        datatype=[c.type for c in visible_columns],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            default_selection=[c.name for c in visible_columns if c.displayed_by_default],
+            cant_deselect=[c.name for c in visible_columns if c.never_hidden],
             label="Select Columns to Display:",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden and c.name in dataframe.columns],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
@@ -201,4 +202,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
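
For orientation, a minimal, self-contained sketch of the filtering that init_leaderboard now applies: hidden columns are dropped from both the datatype list and the column selector in one place. The ColumnContent stand-in and the three example columns below are illustrative only, not the project's real definitions.

# Illustrative sketch of the visible-column filtering (stand-in column definitions).
from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

columns = [
    ColumnContent("Model", "markdown", True, never_hidden=True),
    ColumnContent("Average ⬆️", "number", True),
    ColumnContent("Some task", "number", False, hidden=True),  # hidden per-task metric
]

visible_columns = [c for c in columns if not c.hidden]
print([c.type for c in visible_columns])                            # ['markdown', 'number']
print([c.name for c in visible_columns if c.displayed_by_default])  # ['Model', 'Average ⬆️']
print([c.name for c in visible_columns if c.never_hidden])          # ['Model']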
src/display/css_html_js.py CHANGED
@@ -103,3 +103,20 @@ get_window_url_params = """
         return url_params;
         }
     """
+
+# src/display/css_html_js.py
+
+custom_css = """
+/* ... (keep the earlier rules unchanged) ... */
+
+#box-filter > .form{
+  border: 0
+}
+
+/* --- New rule: allow line breaks in table headers --- */
+th {
+  white-space: pre-wrap !important;
+  text-align: center !important;
+  vertical-align: bottom !important;
+}
+"""
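
The pre-wrap rule matters because the new aggregate columns defined in src/display/utils.py embed a literal newline in their display names; with default white-space handling that newline would collapse to a single space in the table header. A tiny illustration, using one of the new display names:

# The header string contains "\n"; `white-space: pre-wrap` on `th` lets it render
# as an actual line break instead of collapsing to a space.
header = "Accent\n(SER|WER|SW-WER)"
print(header)
# Accent
# (SER|WER|SW-WER)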
src/display/utils.py CHANGED
@@ -20,6 +20,19 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+# Aggregated metrics displayed alongside the global average
+ADDITIONAL_SCORE_SPECS = [
+    ("accent_oriented", "Accent\n(SER|WER|SW-WER)"),
+    ("acoustic_env_oriented", "Acoustic\n(SER|WER|SW-WER)"),
+    ("age_oriented", "Age\n(SER|WER|SW-WER)"),
+    ("formality_oriented", "Formality\n(SER|WER|SW-WER)"),
+    ("gender_oriented", "Gender\n(SER|WER|SW-WER)"),
+    ("num_of_speaker_oriented", "#Speakers\n(SER|WER|SW-WER)"),
+    ("spontaneous_oriented", "Spontaneous\n(SER|WER|SW-WER)"),
+]
+ADDITIONAL_SCORE_FIELDS = [name for name, _ in ADDITIONAL_SCORE_SPECS]
+ADDITIONAL_SCORE_SOURCE_KEYS = {name: [name.replace("_", "-"), name] for name in ADDITIONAL_SCORE_FIELDS}
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
@@ -27,8 +40,11 @@ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent(
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for field_name, display_name in ADDITIONAL_SCORE_SPECS:
+    auto_eval_column_dict.append([field_name, ColumnContent, ColumnContent(display_name, "number", True)])
+# Hide task-specific metrics from the selector; only expose the aggregate.
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False, True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -107,4 +123,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
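A quick illustration of the lookup table this adds: each snake_case field name maps to its dashed and underscored spellings, which the reader in src/leaderboard/read_evals.py tries in order. The two field names below are a subset of the spec above, shown only to make the mapping concrete.

# Sketch of the generated source-key mapping for two of the aggregate columns.
ADDITIONAL_SCORE_FIELDS = ["accent_oriented", "acoustic_env_oriented"]
ADDITIONAL_SCORE_SOURCE_KEYS = {name: [name.replace("_", "-"), name] for name in ADDITIONAL_SCORE_FIELDS}
print(ADDITIONAL_SCORE_SOURCE_KEYS["accent_oriented"])
# ['accent-oriented', 'accent_oriented']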
src/leaderboard/read_evals.py CHANGED
@@ -2,16 +2,60 @@ import glob
 import json
 import math
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import (
+    ADDITIONAL_SCORE_FIELDS,
+    ADDITIONAL_SCORE_SOURCE_KEYS,
+    AutoEvalColumn,
+    ModelType,
+    Precision,
+    Tasks,
+    WeightType,
+)
 from src.submission.check_validity import is_model_on_hub
 
 
+def _score_to_percentage(score):
+    """Convert ratio metrics to percentage while leaving already-percentage scores untouched."""
+    if isinstance(score, (int, float)):
+        return score * 100 if 0 <= score <= 1 else score
+    return None
+
+
+def _extract_numeric_metric(metric_container):
+    """Grab the first numeric value from a metric container."""
+    if isinstance(metric_container, (int, float)):
+        return metric_container
+    if isinstance(metric_container, dict):
+        for value in metric_container.values():
+            if isinstance(value, (int, float)):
+                return value
+    return None
+
+
+def _extract_additional_scores(results):
+    """Extract additional aggregate scores (accent, gender, etc.) from the raw results."""
+    scores = {getattr(AutoEvalColumn, field_name).name: None for field_name in ADDITIONAL_SCORE_FIELDS}
+    normalized_results = {k.lower(): v for k, v in results.items()}
+
+    for field_name, candidate_keys in ADDITIONAL_SCORE_SOURCE_KEYS.items():
+        metric_value = None
+        for candidate_key in candidate_keys:
+            normalized_key = candidate_key.lower()
+            if normalized_key in normalized_results:
+                metric_value = _extract_numeric_metric(normalized_results[normalized_key])
+                break
+        if metric_value is not None:
+            scores[getattr(AutoEvalColumn, field_name).name] = _score_to_percentage(metric_value)
+
+    return scores
+
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
@@ -22,6 +66,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
+    aggregated_scores: dict = field(default_factory=dict)
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -78,6 +123,7 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+        aggregated_scores = _extract_additional_scores(data.get("results", {}))
 
         return self(
             eval_name=result_key,
@@ -85,6 +131,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
+            aggregated_scores=aggregated_scores,
            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
@@ -126,6 +173,7 @@ class EvalResult:
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
+        data_dict.update(self.aggregated_scores)
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
@@ -182,6 +230,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            eval_results[eval_name].aggregated_scores.update(
+                {k: v for k, v in eval_result.aggregated_scores.items() if v is not None}
+            )
         else:
             eval_results[eval_name] = eval_result
 
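The two parsing helpers introduced above can be exercised on their own. The sample payload below is made up (real results files may use different metric names), but it shows the intended ratio-versus-percentage handling:

# Self-contained walk-through of the parsing helpers, with a hypothetical results payload.
def _score_to_percentage(score):
    """Convert ratio metrics to percentages, leaving already-percentage scores untouched."""
    if isinstance(score, (int, float)):
        return score * 100 if 0 <= score <= 1 else score
    return None

def _extract_numeric_metric(metric_container):
    """Grab the first numeric value from a metric container (plain number or dict of metrics)."""
    if isinstance(metric_container, (int, float)):
        return metric_container
    if isinstance(metric_container, dict):
        for value in metric_container.values():
            if isinstance(value, (int, float)):
                return value
    return None

sample_results = {
    "accent-oriented": {"wer": 0.25},  # ratio -> converted to 25.0
    "gender-oriented": 41.5,           # already a percentage -> kept as-is
}
print(_score_to_percentage(_extract_numeric_metric(sample_results["accent-oriented"])))  # 25.0
print(_score_to_percentage(_extract_numeric_metric(sample_results["gender-oriented"])))  # 41.5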
src/populate.py CHANGED
@@ -14,11 +14,13 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    benchmark_cols_available = [col for col in benchmark_cols if col in df.columns]
+    if benchmark_cols_available:
+        df = df[has_no_nan_values(df, benchmark_cols_available)]
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
     return df
 
 
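
A small pandas sketch of why the availability guard matters: benchmark columns that never made it into the results frame are skipped instead of raising a KeyError, while rows missing a present benchmark are still filtered out. has_no_nan_values is re-declared here only to keep the snippet self-contained and is assumed to behave like the project's helper.

# Hypothetical mini-frame demonstrating the guarded benchmark filtering.
import pandas as pd

def has_no_nan_values(df, columns):
    # keep rows whose benchmark cells are all filled
    return df[columns].notna().all(axis=1)

df = pd.DataFrame({"Average ⬆️": [71.2, 65.4], "Accent\n(SER|WER|SW-WER)": [70.0, None]})
benchmark_cols = ["Accent\n(SER|WER|SW-WER)", "Some missing benchmark"]

benchmark_cols_available = [col for col in benchmark_cols if col in df.columns]
if benchmark_cols_available:
    df = df[has_no_nan_values(df, benchmark_cols_available)]
print(df)  # only the first row survives; the absent column is simply ignored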