Spaces:
Runtime error
Commit
·
6b490fd
1
Parent(s):
9883695
More name fixing
Browse files
src/display/utils.py
CHANGED
|
@@ -113,7 +113,7 @@ class EvalQueueColumn: # Queue column
|
|
| 113 |
|
| 114 |
|
| 115 |
baseline_row = {
|
| 116 |
-
AutoEvalColumn.
|
| 117 |
# AutoEvalColumn.revision.name: "N/A",
|
| 118 |
# AutoEvalColumn.precision.name: None,
|
| 119 |
# AutoEvalColumn.merged.name: False,
|
|
@@ -138,7 +138,7 @@ baseline_row = {
|
|
| 138 |
# GSM8K: paper
|
| 139 |
# Define the human baselines
|
| 140 |
human_baseline_row = {
|
| 141 |
-
AutoEvalColumn.
|
| 142 |
# AutoEvalColumn.revision.name: "N/A",
|
| 143 |
# AutoEvalColumn.precision.name: None,
|
| 144 |
# AutoEvalColumn.average.name: 92.75,
|
|
|
|
| 113 |
|
| 114 |
|
| 115 |
baseline_row = {
|
| 116 |
+
AutoEvalColumn.model_name.name: "<p>Baseline</p>",
|
| 117 |
# AutoEvalColumn.revision.name: "N/A",
|
| 118 |
# AutoEvalColumn.precision.name: None,
|
| 119 |
# AutoEvalColumn.merged.name: False,
|
|
|
|
| 138 |
# GSM8K: paper
|
| 139 |
# Define the human baselines
|
| 140 |
human_baseline_row = {
|
| 141 |
+
AutoEvalColumn.model_name.name: "<p>Human performance</p>",
|
| 142 |
# AutoEvalColumn.revision.name: "N/A",
|
| 143 |
# AutoEvalColumn.precision.name: None,
|
| 144 |
# AutoEvalColumn.average.name: 92.75,
|
src/leaderboard/filter_models.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from src.display.formatting import model_hyperlink
|
| 2 |
from src.display.utils import AutoEvalColumn
|
| 3 |
|
| 4 |
-
|
| 5 |
# Models which have been flagged by users as being problematic for a reason or another
|
| 6 |
# (Model name to forum discussion link)
|
| 7 |
FLAGGED_MODELS = {
|
|
@@ -148,8 +147,8 @@ def flag_models(leaderboard_data: list[dict]):
|
|
| 148 |
f"See discussion #{issue_num}",
|
| 149 |
)
|
| 150 |
model_data[
|
| 151 |
-
AutoEvalColumn.
|
| 152 |
-
] = f"{model_data[AutoEvalColumn.
|
| 153 |
model_data[AutoEvalColumn.not_flagged.name] = False
|
| 154 |
else:
|
| 155 |
model_data[AutoEvalColumn.not_flagged.name] = True
|
|
|
|
| 1 |
from src.display.formatting import model_hyperlink
|
| 2 |
from src.display.utils import AutoEvalColumn
|
| 3 |
|
|
|
|
| 4 |
# Models which have been flagged by users as being problematic for a reason or another
|
| 5 |
# (Model name to forum discussion link)
|
| 6 |
FLAGGED_MODELS = {
|
|
|
|
| 147 |
f"See discussion #{issue_num}",
|
| 148 |
)
|
| 149 |
model_data[
|
| 150 |
+
AutoEvalColumn.model_name.name
|
| 151 |
+
] = f"{model_data[AutoEvalColumn.model_name.name]} has been flagged! {issue_link}"
|
| 152 |
model_data[AutoEvalColumn.not_flagged.name] = False
|
| 153 |
else:
|
| 154 |
model_data[AutoEvalColumn.not_flagged.name] = True
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -1,17 +1,15 @@
|
|
| 1 |
import json
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
from json import JSONDecodeError
|
| 4 |
import logging
|
| 5 |
import math
|
| 6 |
-
|
| 7 |
from dataclasses import dataclass, field
|
| 8 |
-
from
|
|
|
|
|
|
|
| 9 |
|
|
|
|
| 10 |
from tqdm import tqdm
|
| 11 |
from tqdm.contrib.logging import logging_redirect_tqdm
|
| 12 |
|
| 13 |
-
import numpy as np
|
| 14 |
-
|
| 15 |
from src.display.formatting import make_clickable_model
|
| 16 |
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
|
| 17 |
|
|
@@ -174,7 +172,7 @@ class EvalResult:
|
|
| 174 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
| 175 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
| 176 |
AutoEvalColumn.architecture.name: self.architecture,
|
| 177 |
-
AutoEvalColumn.
|
| 178 |
AutoEvalColumn.fullname.name: self.full_model,
|
| 179 |
AutoEvalColumn.revision.name: self.revision,
|
| 180 |
AutoEvalColumn.average.name: average,
|
|
|
|
| 1 |
import json
|
|
|
|
|
|
|
| 2 |
import logging
|
| 3 |
import math
|
|
|
|
| 4 |
from dataclasses import dataclass, field
|
| 5 |
+
from json import JSONDecodeError
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Dict, List, Optional
|
| 8 |
|
| 9 |
+
import numpy as np
|
| 10 |
from tqdm import tqdm
|
| 11 |
from tqdm.contrib.logging import logging_redirect_tqdm
|
| 12 |
|
|
|
|
|
|
|
| 13 |
from src.display.formatting import make_clickable_model
|
| 14 |
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
|
| 15 |
|
|
|
|
| 172 |
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
| 173 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
| 174 |
AutoEvalColumn.architecture.name: self.architecture,
|
| 175 |
+
AutoEvalColumn.model_name.name: make_clickable_model(self.full_model),
|
| 176 |
AutoEvalColumn.fullname.name: self.full_model,
|
| 177 |
AutoEvalColumn.revision.name: self.revision,
|
| 178 |
AutoEvalColumn.average.name: average,
|