Spaces:
Runtime error
Runtime error
christodoulos.constantinides@ibm.com
committed on
Commit
·
fa0f3d4
1
Parent(s):
2c9911b
update
Browse files
- app.py +9 -2
- src/about.py +5 -1
- src/display/utils.py +7 -1
- src/leaderboard/read_evals.py +1 -1
app.py
CHANGED
|
@@ -5,7 +5,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
import plotly.graph_objects as go
|
| 7 |
import plotly.express as px
|
| 8 |
-
from src.about import Tasks, AssetTasks
|
| 9 |
|
| 10 |
from src.about import (
|
| 11 |
CITATION_BUTTON_LABEL,
|
|
@@ -21,10 +21,12 @@ from src.display.utils import (
|
|
| 21 |
ASSET_BENCHMARK_COLS,
|
| 22 |
COLS,
|
| 23 |
ASSET_COLS,
|
|
|
|
| 24 |
EVAL_COLS,
|
| 25 |
EVAL_TYPES,
|
| 26 |
AutoEvalColumn,
|
| 27 |
AutoEvalColumnAsset,
|
|
|
|
| 28 |
ModelType,
|
| 29 |
fields,
|
| 30 |
WeightType,
|
|
@@ -63,6 +65,8 @@ print(ASSET_COLS)
|
|
| 63 |
|
| 64 |
ASSET_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ASSET_COLS, ASSET_BENCHMARK_COLS, AssetTasks)
|
| 65 |
|
|
|
|
|
|
|
| 66 |
|
| 67 |
(
|
| 68 |
finished_eval_queue_df,
|
|
@@ -165,8 +169,11 @@ with demo:
|
|
| 165 |
|
| 166 |
with gr.TabItem("🛠️ Asset Benchmark", elem_id="llm-benchmark-asset-tab-table", id=1):
|
| 167 |
leaderboard = init_leaderboard(ASSET_LEADERBOARD_DF, AutoEvalColumnAsset)
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
with gr.TabItem("📊 Performance Plot", elem_id="llm-benchmark-tab-table", id=
|
| 170 |
print(LEADERBOARD_DF.columns)
|
| 171 |
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 172 |
perf_plot = gr.components.Plot(
|
|
|
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
import plotly.graph_objects as go
|
| 7 |
import plotly.express as px
|
| 8 |
+
from src.about import Tasks, AssetTasks, UncertaintyTasks
|
| 9 |
|
| 10 |
from src.about import (
|
| 11 |
CITATION_BUTTON_LABEL,
|
|
|
|
| 21 |
ASSET_BENCHMARK_COLS,
|
| 22 |
COLS,
|
| 23 |
ASSET_COLS,
|
| 24 |
+
UNCERTAINTY_COLS,
|
| 25 |
EVAL_COLS,
|
| 26 |
EVAL_TYPES,
|
| 27 |
AutoEvalColumn,
|
| 28 |
AutoEvalColumnAsset,
|
| 29 |
+
AutoEvalColumnUncertainty,
|
| 30 |
ModelType,
|
| 31 |
fields,
|
| 32 |
WeightType,
|
|
|
|
| 65 |
|
| 66 |
ASSET_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ASSET_COLS, ASSET_BENCHMARK_COLS, AssetTasks)
|
| 67 |
|
| 68 |
+
UNCERTAINTY_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, UNCERTAINTY_COLS, ASSET_BENCHMARK_COLS, UncertaintyTasks)
|
| 69 |
+
|
| 70 |
|
| 71 |
(
|
| 72 |
finished_eval_queue_df,
|
|
|
|
| 169 |
|
| 170 |
with gr.TabItem("🛠️ Asset Benchmark", elem_id="llm-benchmark-asset-tab-table", id=1):
|
| 171 |
leaderboard = init_leaderboard(ASSET_LEADERBOARD_DF, AutoEvalColumnAsset)
|
| 172 |
+
|
| 173 |
+
with gr.TabItem("😵💫 Uncertainty Benchmark", elem_id="llm-benchmark-asset-tab-table", id=2):
|
| 174 |
+
leaderboard = init_leaderboard(UNCERTAINTY_LEADERBOARD_DF, AutoEvalColumnUncertainty)
|
| 175 |
|
| 176 |
+
with gr.TabItem("📊 Performance Plot", elem_id="llm-benchmark-tab-table", id=3):
|
| 177 |
print(LEADERBOARD_DF.columns)
|
| 178 |
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 179 |
perf_plot = gr.components.Plot(
|
src/about.py
CHANGED
|
@@ -17,7 +17,6 @@ class Tasks(Enum):
|
|
| 17 |
task2 = Task("acc_el", "acc_el", "Acc_El")
|
| 18 |
task3 = Task("acc_perturb", "perturb_score", "Acc_Perturb")
|
| 19 |
task4 = Task("score_consistency", "consist_score", "Consistency_Score")
|
| 20 |
-
task5 = Task("uncertainty", "uncertainty_score", "Uncertainty_Score")
|
| 21 |
|
| 22 |
class AssetTasks(Enum):
|
| 23 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
|
@@ -32,6 +31,11 @@ class AssetTasks(Enum):
|
|
| 32 |
task8 = Task("acc_fan", "acc_fan", "acc_fan")
|
| 33 |
task9 = Task("acc_power_transformer", "acc_power_transformer", "acc_power_transformer")
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# {
|
| 37 |
# "acc_overall": {
|
|
|
|
| 17 |
task2 = Task("acc_el", "acc_el", "Acc_El")
|
| 18 |
task3 = Task("acc_perturb", "perturb_score", "Acc_Perturb")
|
| 19 |
task4 = Task("score_consistency", "consist_score", "Consistency_Score")
|
|
|
|
| 20 |
|
| 21 |
class AssetTasks(Enum):
|
| 22 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
|
|
|
| 31 |
task8 = Task("acc_fan", "acc_fan", "acc_fan")
|
| 32 |
task9 = Task("acc_power_transformer", "acc_power_transformer", "acc_power_transformer")
|
| 33 |
|
| 34 |
+
class UncertaintyTasks(Enum):
|
| 35 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 36 |
+
task0 = Task("fmsr_ss", "fmsr_ss", "fmsr_ss")
|
| 37 |
+
task1 = Task("fmsr_coverage_rate", "fmsr_coverage_rate", "fmsr_coverage_rate")
|
| 38 |
+
task2 = Task("fmsr_uacc", "fmsr_uacc", "fmsr_uacc")
|
| 39 |
|
| 40 |
# {
|
| 41 |
# "acc_overall": {
|
src/display/utils.py
CHANGED
|
@@ -3,7 +3,7 @@ from enum import Enum
|
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
-
from src.about import Tasks, AssetTasks
|
| 7 |
|
| 8 |
def fields(raw_class):
|
| 9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
|
@@ -50,6 +50,9 @@ auto_eval_column_asset_dict = get_auto_eval_column_dict(AssetTasks)
|
|
| 50 |
# We use make dataclass to dynamically fill the scores from Tasks
|
| 51 |
AutoEvalColumnAsset = make_dataclass("AutoEvalColumnAsset", auto_eval_column_asset_dict, frozen=True)
|
| 52 |
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
## For the queue columns in the submission tab
|
| 55 |
@dataclass(frozen=True)
|
|
@@ -111,10 +114,13 @@ class Precision(Enum):
|
|
| 111 |
# Column selection
|
| 112 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
| 113 |
ASSET_COLS = [c.name for c in fields(AutoEvalColumnAsset) if not c.hidden]
|
|
|
|
|
|
|
| 114 |
|
| 115 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
| 116 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 117 |
|
| 118 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 119 |
ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
|
|
|
|
| 120 |
|
|
|
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
+
from src.about import Tasks, AssetTasks, UncertaintyTasks
|
| 7 |
|
| 8 |
def fields(raw_class):
|
| 9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
|
|
|
| 50 |
# We use make dataclass to dynamically fill the scores from Tasks
|
| 51 |
AutoEvalColumnAsset = make_dataclass("AutoEvalColumnAsset", auto_eval_column_asset_dict, frozen=True)
|
| 52 |
|
| 53 |
+
auto_eval_column_uncertainty_dict = get_auto_eval_column_dict(UncertaintyTasks)
|
| 54 |
+
# We use make dataclass to dynamically fill the scores from Tasks
|
| 55 |
+
AutoEvalColumnUncertainty = make_dataclass("AutoEvalColumnUncertainty", auto_eval_column_uncertainty_dict, frozen=True)
|
| 56 |
|
| 57 |
## For the queue columns in the submission tab
|
| 58 |
@dataclass(frozen=True)
|
|
|
|
| 114 |
# Column selection
|
| 115 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
| 116 |
ASSET_COLS = [c.name for c in fields(AutoEvalColumnAsset) if not c.hidden]
|
| 117 |
+
UNCERTAINTY_COLS = [c.name for c in fields(AutoEvalColumnUncertainty) if not c.hidden]
|
| 118 |
+
|
| 119 |
|
| 120 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
| 121 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 122 |
|
| 123 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 124 |
ASSET_BENCHMARK_COLS = [t.value.col_name for t in AssetTasks]
|
| 125 |
+
ASSET_BENCHMARK_COLS = [t.value.col_name for t in UncertaintyTasks]
|
| 126 |
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -114,7 +114,7 @@ class EvalResult:
|
|
| 114 |
def to_dict(self, task_class):
|
| 115 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 116 |
#ignore uncertainty for overall calculation
|
| 117 |
-
scores = [v for k, v in zip(self.results.keys(), self.results.values()) if v is not None
|
| 118 |
average = sum(scores) / len(scores)
|
| 119 |
# average = sum([v for v in self.results.values() if v is not None]) / len(task_class)
|
| 120 |
data_dict = {
|
|
|
|
| 114 |
def to_dict(self, task_class):
|
| 115 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 116 |
#ignore uncertainty for overall calculation
|
| 117 |
+
scores = [v for k, v in zip(self.results.keys(), self.results.values()) if v is not None]
|
| 118 |
average = sum(scores) / len(scores)
|
| 119 |
# average = sum([v for v in self.results.values() if v is not None]) / len(task_class)
|
| 120 |
data_dict = {
|