Spaces:
Configuration error
Configuration error
push to have sliders - kwargs - example data and reliability plot
Browse files
app.py
CHANGED
|
@@ -1,24 +1,173 @@
|
|
| 1 |
import evaluate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from evaluate.utils import launch_gradio_widget
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
module = evaluate.load("jordyvl/ece")
|
| 5 |
-
launch_gradio_widget(module)
|
| 6 |
|
| 7 |
"""
|
| 8 |
DEV: #might be nice to also plot reliability diagram
|
| 9 |
have sliders for kwargs :)
|
| 10 |
|
| 11 |
-
import gradio as gr
|
| 12 |
|
| 13 |
metric = ECE()
|
| 14 |
|
| 15 |
-
iface = gr.Interface(
|
| 16 |
-
fn=compute,
|
| 17 |
-
inputs=gr.inputs.Dataframe(headers=["predictions", "references"], col_width=2, datatype="number"),
|
| 18 |
-
outputs=gr.outputs.Textbox(label="accuracy"),
|
| 19 |
-
description=metric.info.description,
|
| 20 |
-
article=metric.info.citation,
|
| 21 |
-
)
|
| 22 |
|
| 23 |
-
|
| 24 |
-
"""
|
|
|
|
| 1 |
import evaluate
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import ast
|
| 5 |
+
import json
|
| 6 |
+
import gradio as gr
|
| 7 |
from evaluate.utils import launch_gradio_widget
|
| 8 |
+
from ece import ECE
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
sliders = [
|
| 12 |
+
gr.Slider(0, 100, value=10, label="n_bins"),
|
| 13 |
+
gr.Slider(0, 100, value=None, label="bin_range", visible=False), #DEV: need to have a double slider
|
| 14 |
+
gr.Dropdown(choices=["equal-range", "equal-mass"], value="equal-range", label="scheme"),
|
| 15 |
+
gr.Dropdown(choices=["upper-edge", "center"], value="upper-edge", label="proxy"),
|
| 16 |
+
gr.Dropdown(choices=[1, 2, np.inf], value=1, label="p"),
|
| 17 |
+
]
|
| 18 |
+
|
| 19 |
+
slider_defaults = [slider.value for slider in sliders]
|
| 20 |
+
|
| 21 |
+
# example data
|
| 22 |
+
df = dict()
|
| 23 |
+
df["predictions"] = [[0.6, 0.2, 0.2], [0, 0.95, 0.05], [0.7, 0.1, 0.2]]
|
| 24 |
+
df["references"] = [0, 1, 2]
|
| 25 |
+
|
| 26 |
+
component = gr.inputs.Dataframe(
|
| 27 |
+
headers=["predictions", "references"], col_count=2, datatype="number", type="pandas"
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
component.value = [
|
| 31 |
+
[[0.6, 0.2, 0.2], 0],
|
| 32 |
+
[[0.7, 0.1, 0.2], 2],
|
| 33 |
+
[[0, 0.95, 0.05], 1],
|
| 34 |
+
]
|
| 35 |
+
sample_data = [[component] + slider_defaults] ##json.dumps(df)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
metric = ECE()
|
| 39 |
+
# module = evaluate.load("jordyvl/ece")
|
| 40 |
+
# launch_gradio_widget(module)
|
| 41 |
+
|
| 42 |
+
"""
|
| 43 |
+
Switch inputs and compute_fn
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
def reliability_plot(results):
    """Draw a reliability diagram with a bin-frequency histogram beneath it.

    Args:
        results: mapping that provides the entries "y_bar", "p_bar",
            "bin_freq", "accuracy" and "p_bar_cont".
            NOTE(review): presumably the detailed output of the ECE metric
            (y_bar = per-bin calibrated accuracy, p_bar = per-bin empirical
            accuracy) -- confirm against ece.py.

    Returns:
        The matplotlib figure that holds both axes.
    """
    # CE, calibrated_acc, empirical_acc, weights_ece
    # {"ECE": ECE[0], "y_bar": ECE[1], "p_bar": ECE[2], "bin_freq": ECE[3]}
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set_style('white')
    sns.set_context("paper", font_scale=1)  # 2
    # plt.rcParams['figure.figsize'] = [10, 7]
    plt.rcParams['figure.dpi'] = 300

    fig = plt.figure()
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    y_bar = results["y_bar"]
    p_bar = results["p_bar"]
    n_bins = len(y_bar)
    # Lower bound is y_bar[0] - y_bar[0], i.e. zero; upper bound is the last proxy.
    bin_range = [y_bar[0] - y_bar[0], y_bar[-1]]  # np.linspace(0, 1, n_bins)
    # if upper edge then minus binsize; same for center [but half]

    diagonal = np.linspace(bin_range[0], bin_range[1], n_bins)
    ax1.plot(diagonal, diagonal, color="darkgreen", ls="dotted", label="Perfect")
    # ax1.plot(results["y_bar"], results["y_bar"], color="darkblue", label="Perfect")

    anindices = np.where(~np.isnan(p_bar[:-1]))[0]
    bin_freqs = np.zeros(n_bins)
    bin_freqs[anindices] = results["bin_freq"]
    ax2.hist(y_bar, y_bar, weights=bin_freqs)

    # One width per bin: prepending the lower bound gives n entries instead of
    # n - 1, so the last bin no longer indexes past the end of the array.
    widths = np.diff(y_bar, prepend=bin_range[0])
    for perfect, empirical, width in zip(y_bar, p_bar, widths):
        if np.isnan(empirical):
            # Bin without an empirical value: nothing to draw.
            continue
        # Negative width with align="edge" extends the bar to the left of `perfect`.
        ax1.bar([perfect], height=[empirical], width=-width, align="edge", color="lightblue")

    acc_plt = ax2.axvline(
        x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy"
    )
    conf_plt = ax2.axvline(
        x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
    )

    # Bin differences
    ax1.set_ylabel("Conditional Expectation")
    ax1.set_ylim([-0.05, 1.05])  # respective to bin range
    ax1.legend(loc="lower right")
    ax1.set_title("Reliability Diagram")

    # Bin frequencies
    ax2.set_xlabel("Confidence")
    ax2.set_ylabel("Count")
    # A single legend call: a second call would replace the first one anyway.
    ax2.legend(handles=[acc_plt, conf_plt], loc="upper left")  # , ncol=2
    plt.tight_layout()
    return fig
|
| 115 |
+
|
| 116 |
+
def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
    """Compute the ECE for the submitted table and build its reliability diagram.

    Args:
        data: table with a "predictions" and a "references" column. Each
            prediction is either a list or a string holding a Python literal.
        n_bins: forwarded unchanged to ``metric._compute``.
        bin_range: accepted so the signature matches the widget list, but not
            forwarded to the metric (its widget is hidden).
        scheme: forwarded unchanged to ``metric._compute``.
        proxy: forwarded unchanged to ``metric._compute``.
        p: forwarded unchanged to ``metric._compute``.

    Returns:
        A tuple of the "ECE" entry of the metric output and the figure
        returned by ``reliability_plot``.
    """
    # DEV: check on invalid datatypes with better warnings

    if isinstance(data, pd.DataFrame):
        # Drop incomplete rows on a copy so the caller's frame is left intact.
        data = data.dropna()

    # Cells that are not already lists are parsed as Python literals;
    # ast.literal_eval never executes code.
    predictions = [
        prediction if isinstance(prediction, list) else ast.literal_eval(prediction)
        for prediction in data["predictions"]
    ]
    references = list(data["references"])

    results = metric._compute(
        predictions,
        references,
        n_bins=n_bins,
        # bin_range=None,#not needed
        scheme=scheme,
        proxy=proxy,
        p=p,
        detail=True,
    )

    # Hand back the figure object itself: pyplot is not imported at module
    # level, so looking up the current figure through `plt` here would fail.
    figure = reliability_plot(results)
    return results["ECE"], figure
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
outputs = [gr.outputs.Textbox(label="ECE"), gr.outputs.Plot(label="Reliability diagram")]
|
| 144 |
+
|
| 145 |
+
iface = gr.Interface(
|
| 146 |
+
fn=compute_and_plot,
|
| 147 |
+
inputs=[component] + sliders,
|
| 148 |
+
outputs=outputs,
|
| 149 |
+
description=metric.info.description,
|
| 150 |
+
article=metric.info.citation,
|
| 151 |
+
# examples=sample_data
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
# ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
|
| 155 |
+
|
| 156 |
+
iface.launch()
|
| 157 |
+
|
| 158 |
+
# dict = {"ECE": ECE[0], "y_bar": ECE[1], "p_bar": ECE[2], "bin_freq": ECE[3]}
|
| 159 |
+
|
| 160 |
+
# references=[0, 1, 2], predictions=)
|
| 161 |
+
# https://gradio.app/getting_started/#multiple-inputs-and-outputs
|
| 162 |
+
## fix with sliders for all kwargs
|
| 163 |
|
|
|
|
|
|
|
| 164 |
|
| 165 |
"""
|
| 166 |
DEV: #might be nice to also plot reliability diagram
|
| 167 |
have sliders for kwargs :)
|
| 168 |
|
|
|
|
| 169 |
|
| 170 |
metric = ECE()
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
+
"""
|
|
|
ece.py
CHANGED
|
@@ -18,6 +18,8 @@
|
|
| 18 |
import evaluate
|
| 19 |
import datasets
|
| 20 |
import numpy as np
|
|
|
|
|
|
|
| 21 |
|
| 22 |
|
| 23 |
# TODO: Add BibTeX citation
|
|
@@ -161,7 +163,7 @@ def bin_calibrated_accuracy(bins, proxy="upper-edge"):
|
|
| 161 |
return bins[:-1] + np.diff(bins) / 2
|
| 162 |
|
| 163 |
|
| 164 |
-
def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge"):
|
| 165 |
"""
|
| 166 |
y_correct: binary (N x 1)
|
| 167 |
P: normalized (N x 1) either max or per class
|
|
@@ -187,6 +189,8 @@ def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge"):
|
|
| 187 |
elif np.isinf(p): # max-ECE
|
| 188 |
CE = np.max(abs(empirical_acc[anindices] - calibrated_acc[anindices]))
|
| 189 |
|
|
|
|
|
|
|
| 190 |
return CE
|
| 191 |
|
| 192 |
|
|
@@ -196,7 +200,10 @@ def top_1_CE(Y, P, **kwargs):
|
|
| 196 |
bins = create_bins(
|
| 197 |
n_bins=kwargs["n_bins"], bin_range=kwargs["bin_range"], scheme=kwargs["scheme"], P=p_max
|
| 198 |
)
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
|
| 202 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
|
@@ -230,7 +237,14 @@ class ECE(evaluate.EvaluationModule):
|
|
| 230 |
)
|
| 231 |
|
| 232 |
def init_kwargs(
|
| 233 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
):
|
| 235 |
# super(evaluate.EvaluationModule, self).__init__(**kwargs)
|
| 236 |
self.n_bins = n_bins
|
|
@@ -238,6 +252,7 @@ class ECE(evaluate.EvaluationModule):
|
|
| 238 |
self.scheme = scheme
|
| 239 |
self.proxy = proxy
|
| 240 |
self.p = p
|
|
|
|
| 241 |
|
| 242 |
def _compute(self, predictions, references, **kwargs):
|
| 243 |
|
|
@@ -266,6 +281,36 @@ class ECE(evaluate.EvaluationModule):
|
|
| 266 |
|
| 267 |
"""Returns the scores"""
|
| 268 |
ECE = top_1_CE(references, predictions, **self.__dict__)
|
|
|
|
|
|
|
| 269 |
return {
|
| 270 |
"ECE": ECE,
|
| 271 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
import evaluate
|
| 19 |
import datasets
|
| 20 |
import numpy as np
|
| 21 |
+
from typing import Dict, Optional
|
| 22 |
+
|
| 23 |
|
| 24 |
|
| 25 |
# TODO: Add BibTeX citation
|
|
|
|
| 163 |
return bins[:-1] + np.diff(bins) / 2
|
| 164 |
|
| 165 |
|
| 166 |
+
def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge", detail=False):
|
| 167 |
"""
|
| 168 |
y_correct: binary (N x 1)
|
| 169 |
P: normalized (N x 1) either max or per class
|
|
|
|
| 189 |
elif np.isinf(p): # max-ECE
|
| 190 |
CE = np.max(abs(empirical_acc[anindices] - calibrated_acc[anindices]))
|
| 191 |
|
| 192 |
+
if detail:
|
| 193 |
+
return CE, calibrated_acc, empirical_acc, weights_ece
|
| 194 |
return CE
|
| 195 |
|
| 196 |
|
|
|
|
| 200 |
bins = create_bins(
|
| 201 |
n_bins=kwargs["n_bins"], bin_range=kwargs["bin_range"], scheme=kwargs["scheme"], P=p_max
|
| 202 |
)
|
| 203 |
+
CE = CE_estimate(y_correct, p_max, bins=bins, proxy=kwargs["proxy"], detail=kwargs["detail"])
|
| 204 |
+
if self.detail:
|
| 205 |
+
return {"ECE": CE[0], "y_bar": CE[1], "p_bar": CE[2], "bin_freq": CE[3], "p_bar_cont": np.mean(p_max,-1), "accuracy": np.mean(y_correct)}
|
| 206 |
+
return CE
|
| 207 |
|
| 208 |
|
| 209 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
|
|
|
| 237 |
)
|
| 238 |
|
| 239 |
def init_kwargs(
|
| 240 |
+
self,
|
| 241 |
+
n_bins: int = 10,
|
| 242 |
+
bin_range: Optional[int] = [0, 1],
|
| 243 |
+
scheme: str = "equal-range",
|
| 244 |
+
proxy: str = "upper-edge",
|
| 245 |
+
p=1,
|
| 246 |
+
detail: bool = False,
|
| 247 |
+
**kwargs,
|
| 248 |
):
|
| 249 |
# super(evaluate.EvaluationModule, self).__init__(**kwargs)
|
| 250 |
self.n_bins = n_bins
|
|
|
|
| 252 |
self.scheme = scheme
|
| 253 |
self.proxy = proxy
|
| 254 |
self.p = p
|
| 255 |
+
self.detail = detail
|
| 256 |
|
| 257 |
def _compute(self, predictions, references, **kwargs):
|
| 258 |
|
|
|
|
| 281 |
|
| 282 |
"""Returns the scores"""
|
| 283 |
ECE = top_1_CE(references, predictions, **self.__dict__)
|
| 284 |
+
if self.detail:
|
| 285 |
+
return ECE
|
| 286 |
return {
|
| 287 |
"ECE": ECE,
|
| 288 |
}
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def test_ECE():
    """Smoke-test ECE on random multi-class data, with and without detail output."""
    N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
    K = 5  # K class problem

    def random_mc_instance(concentration=1, onehot=False):
        """Sample one (reference, prediction) pair from Dirichlet draws over K classes."""
        alpha = [concentration for _ in range(K)]
        reference = np.argmax(np.random.dirichlet(alpha), -1)  # class targets
        prediction = np.random.dirichlet(alpha)  # probabilities
        if onehot:
            # `reference` is already a class index, so index the identity
            # matrix with it directly; taking argmax of that scalar again
            # could never select the sampled class.
            reference = np.eye(K)[reference]
        return reference, prediction

    references, predictions = zip(*(random_mc_instance() for _ in range(N)))
    references = np.array(references, dtype=np.int64)
    predictions = np.array(predictions, dtype=np.float32)
    res = ECE()._compute(predictions, references)
    print(f"ECE: {res['ECE']}")

    # No debugger breakpoint here: it would block every non-interactive run.
    res = ECE()._compute(predictions, references, detail=True)
    print(f"ECE: {res['ECE']}")
|
| 314 |
+
|
| 315 |
+
if __name__ == '__main__':
|
| 316 |
+
test_ECE()
|
tests.py
CHANGED
|
@@ -1,39 +1,12 @@
|
|
| 1 |
import numpy as np
|
| 2 |
|
| 3 |
test_cases = [
|
|
|
|
|
|
|
| 4 |
{
|
| 5 |
-
"predictions": [0, 0],
|
| 6 |
-
"references": [
|
| 7 |
-
"result": {"
|
| 8 |
},
|
| 9 |
-
{
|
| 10 |
-
"predictions": [1, 1],
|
| 11 |
-
"references": [1, 1],
|
| 12 |
-
"result": {"metric_score": 1}
|
| 13 |
-
},
|
| 14 |
-
{
|
| 15 |
-
"predictions": [1, 0],
|
| 16 |
-
"references": [1, 1],
|
| 17 |
-
"result": {"metric_score": 0.5}
|
| 18 |
-
}
|
| 19 |
]
|
| 20 |
|
| 21 |
-
|
| 22 |
-
def test_ECE():
|
| 23 |
-
N = 10 # N evaluation instances {(x_i,y_i)}_{i=1}^N
|
| 24 |
-
K = 5 # K class problem
|
| 25 |
-
|
| 26 |
-
def random_mc_instance(concentration=1, onehot=False):
|
| 27 |
-
reference = np.argmax(
|
| 28 |
-
np.random.dirichlet(([concentration for _ in range(K)])), -1
|
| 29 |
-
) # class targets
|
| 30 |
-
prediction = np.random.dirichlet(([concentration for _ in range(K)])) # probabilities
|
| 31 |
-
if onehot:
|
| 32 |
-
reference = np.eye(K)[np.argmax(reference, -1)]
|
| 33 |
-
return reference, prediction
|
| 34 |
-
|
| 35 |
-
references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
|
| 36 |
-
references = np.array(references, dtype=np.int64)
|
| 37 |
-
predictions = np.array(predictions, dtype=np.float32)
|
| 38 |
-
res = ECE()._compute(predictions, references)
|
| 39 |
-
print(f"ECE: {res['ECE']}")
|
|
|
|
| 1 |
import numpy as np
|
| 2 |
|
| 3 |
# Each case pairs prediction vectors and integer references with the expected
# metric output under "result".
test_cases = [
    {"predictions": [[0, 1], [1, 0]], "references": [1, 0], "result": {"ECE": 0}},
    {"predictions": [[0, 1], [1, 0]], "references": [0, 1], "result": {"ECE": 1}},
    {
        "predictions": [[0, 0.1, 0.9], [0.2, 0.8, 0]],
        "references": [2, 0],  # kwargs?
        # TODO: exact value not pinned yet; the intended expectation is
        # 0 < ECE < 1, which a dict literal cannot express.
        "result": {"ECE": None},
    },
]
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|