Spaces:

jordyvl
/

ece

Configuration error

App Files Files Community

jordyvl committed on Jun 9, 2022

Commit

3f722df

1 Parent(s): 1519667

push to have sliders - kwargs - example data and reliability plot

Browse files

Files changed (3) hide show

app.py +161 -12
ece.py +48 -3
tests.py +5 -32

app.py CHANGED Viewed

@@ -1,24 +1,173 @@
 import evaluate
 from evaluate.utils import launch_gradio_widget
-module = evaluate.load("jordyvl/ece")
-launch_gradio_widget(module)
 """
 DEV: #might be nice to also plot reliability diagram
 have sliders for kwargs :)
-import gradio as gr
 metric = ECE()
-iface = gr.Interface(
-  fn=compute,
-  inputs=gr.inputs.Dataframe(headers=["predictions", "references"], col_width=2, datatype="number"),
-  outputs=gr.outputs.Textbox(label="accuracy"),
-  description=metric.info.description,
-  article=metric.info.citation,
- )
-iface.launch()
-"""

 import evaluate
+import numpy as np
+import pandas as pd
+import ast
+import json
+import gradio as gr
 from evaluate.utils import launch_gradio_widget
+from ece import ECE
+sliders = [
+    gr.Slider(0, 100, value=10, label="n_bins"),
+    gr.Slider(0, 100, value=None, label="bin_range", visible=False), #DEV: need to have a double slider
+    gr.Dropdown(choices=["equal-range", "equal-mass"], value="equal-range", label="scheme"),
+    gr.Dropdown(choices=["upper-edge", "center"], value="upper-edge", label="proxy"),
+    gr.Dropdown(choices=[1, 2, np.inf], value=1, label="p"),
+]
+slider_defaults = [slider.value for slider in sliders]
+# example data
+df = dict()
+df["predictions"] = [[0.6, 0.2, 0.2], [0, 0.95, 0.05], [0.7, 0.1, 0.2]]
+df["references"] = [0, 1, 2]
+component = gr.inputs.Dataframe(
+    headers=["predictions", "references"], col_count=2, datatype="number", type="pandas"
+)
+component.value = [
+    [[0.6, 0.2, 0.2], 0],
+    [[0.7, 0.1, 0.2], 2],
+    [[0, 0.95, 0.05], 1],
+]
+sample_data = [[component] + slider_defaults]  ##json.dumps(df)
+metric = ECE()
+# module = evaluate.load("jordyvl/ece")
+# launch_gradio_widget(module)
+"""
+Switch inputs and compute_fn
+"""
+def reliability_plot(results):
+    """Build the reliability-diagram figure for a detailed ECE result.
+
+    `results` is read through the keys "y_bar", "p_bar", "bin_freq",
+    "accuracy" and "p_bar_cont".  In ece.py these are filled from values
+    named calibrated_acc, empirical_acc and weights_ece, the mean of the
+    top-1 confidences, and the mean of the correctness indicators.
+    NOTE(review): the exact lengths of y_bar / p_bar depend on binning code
+    that is not visible here -- confirm that the last bin is always skipped
+    (NaN), otherwise `widths[j]` below runs out of range.
+
+    Returns:
+        The matplotlib Figure: bars against the dotted diagonal on top, the
+        bin-frequency histogram with accuracy / average-confidence markers
+        underneath.
+    """
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    sns.set_style("white")
+    sns.set_context("paper", font_scale=1)  # 2
+    # plt.rcParams['figure.figsize'] = [10, 7]
+    plt.rcParams["figure.dpi"] = 300
+    y_bar = results["y_bar"]
+    p_bar = results["p_bar"]
+    n_bins = len(y_bar)
+    fig = plt.figure()
+    # Bind both axes to `fig` explicitly instead of pyplot's "current figure".
+    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2, fig=fig)
+    ax2 = plt.subplot2grid((3, 1), (2, 0), fig=fig)
+    # The lower bound used to be `y_bar[0] - y_bar[0]`, i.e. always 0.
+    # TODO: if upper edge then minus binsize; same for center [but half]
+    diagonal = np.linspace(0.0, y_bar[-1], n_bins)
+    ax1.plot(diagonal, diagonal, color="darkgreen", ls="dotted", label="Perfect")
+    # bin_freq must hold one entry per non-NaN bin of p_bar[:-1]; scatter it
+    # back over all bins so empty bins get weight 0.
+    anindices = np.where(~np.isnan(p_bar[:-1]))[0]
+    bin_freqs = np.zeros(n_bins)
+    bin_freqs[anindices] = results["bin_freq"]
+    ax2.hist(y_bar, y_bar, weights=bin_freqs)
+    widths = np.diff(y_bar)
+    for j, (perfect, empirical) in enumerate(zip(y_bar, p_bar)):
+        if np.isnan(empirical):
+            continue
+        # Negative width + align="edge" draws the bar to the left of `perfect`.
+        ax1.bar([perfect], height=[empirical], width=-widths[j], align="edge", color="lightblue")
+    acc_plt = ax2.axvline(
+        x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy"
+    )
+    conf_plt = ax2.axvline(
+        x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
+    )
+    # Bin differences
+    ax1.set_ylabel("Conditional Expectation")
+    ax1.set_ylim([-0.05, 1.05])  # respective to bin range
+    ax1.legend(loc="lower right")
+    ax1.set_title("Reliability Diagram")
+    # Bin frequencies
+    ax2.set_xlabel("Confidence")
+    ax2.set_ylabel("Count")
+    # One legend call: there used to be two, the second replacing the first.
+    ax2.legend(handles=[acc_plt, conf_plt], loc="upper left")  # , ncol=2
+    fig.tight_layout()
+    return fig
+def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
+    """Compute ECE for the submitted table and render its reliability diagram.
+
+    Args:
+        data: table with "predictions" and "references" columns; each
+            prediction is either a list or a string that literal-evaluates
+            to one.
+        n_bins, scheme, proxy, p: forwarded to the metric as keyword arguments.
+        bin_range: accepted so the UI inputs line up, but not forwarded.
+
+    Returns:
+        Tuple of (ECE value, matplotlib figure).
+    """
+    # DEV: check on invalid datatypes with better warnings
+    if isinstance(data, pd.DataFrame):
+        # Work on a cleaned copy rather than mutating the caller's frame.
+        data = data.dropna()
+    predictions = [
+        prediction if isinstance(prediction, list) else ast.literal_eval(prediction)
+        for prediction in data["predictions"]
+    ]
+    references = list(data["references"])
+    results = metric._compute(
+        predictions,
+        references,
+        # NOTE(review): the slider may hand over a float; a bin count has to be integral.
+        n_bins=int(n_bins),
+        # bin_range=None,#not needed
+        scheme=scheme,
+        proxy=proxy,
+        p=p,
+        detail=True,
+    )
+    # Return the figure built for these results.  This used to return
+    # `plt.gcf()`, but `plt` is only imported inside reliability_plot, so the
+    # name was undefined here and the call raised NameError.
+    return results["ECE"], reliability_plot(results)
+outputs = [gr.outputs.Textbox(label="ECE"), gr.outputs.Plot(label="Reliability diagram")]
+iface = gr.Interface(
+    fn=compute_and_plot,
+    inputs=[component] + sliders,
+    outputs=outputs,
+    description=metric.info.description,
+    article=metric.info.citation,
+    # examples=sample_data
+)
+# ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
+iface.launch()
+# dict = {"ECE": ECE[0], "y_bar": ECE[1], "p_bar": ECE[2], "bin_freq": ECE[3]}
+# references=[0, 1, 2], predictions=)
+# https://gradio.app/getting_started/#multiple-inputs-and-outputs
+## fix with sliders for all kwargs
 """
 DEV: #might be nice to also plot reliability diagram
 have sliders for kwargs :)
 metric = ECE()
+"""

ece.py CHANGED Viewed

@@ -18,6 +18,8 @@
 import evaluate
 import datasets
 import numpy as np
 # TODO: Add BibTeX citation
@@ -161,7 +163,7 @@ def bin_calibrated_accuracy(bins, proxy="upper-edge"):
         return bins[:-1] + np.diff(bins) / 2
-def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge"):
     """
     y_correct: binary (N x 1)
     P: normalized (N x 1) either max or per class
@@ -187,6 +189,8 @@ def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge"):
     elif np.isinf(p):  # max-ECE
         CE = np.max(abs(empirical_acc[anindices] - calibrated_acc[anindices]))
     return CE
@@ -196,7 +200,10 @@ def top_1_CE(Y, P, **kwargs):
     bins = create_bins(
         n_bins=kwargs["n_bins"], bin_range=kwargs["bin_range"], scheme=kwargs["scheme"], P=p_max
     )
-    return CE_estimate(y_correct, p_max, bins=bins, proxy=kwargs["proxy"])
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -230,7 +237,14 @@ class ECE(evaluate.EvaluationModule):
         )
     def init_kwargs(
-        self, n_bins=10, bin_range=None, scheme="equal-range", proxy="upper-edge", p=1, **kwargs
     ):
         # super(evaluate.EvaluationModule, self).__init__(**kwargs)
         self.n_bins = n_bins
@@ -238,6 +252,7 @@ class ECE(evaluate.EvaluationModule):
         self.scheme = scheme
         self.proxy = proxy
         self.p = p
     def _compute(self, predictions, references, **kwargs):
@@ -266,6 +281,36 @@ class ECE(evaluate.EvaluationModule):
         """Returns the scores"""
         ECE = top_1_CE(references, predictions, **self.__dict__)
         return {
             "ECE": ECE,
         }

 import evaluate
 import datasets
 import numpy as np
+from typing import Dict, Optional
 # TODO: Add BibTeX citation
         return bins[:-1] + np.diff(bins) / 2
+def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge", detail=False):
     """
     y_correct: binary (N x 1)
     P: normalized (N x 1) either max or per class
     elif np.isinf(p):  # max-ECE
         CE = np.max(abs(empirical_acc[anindices] - calibrated_acc[anindices]))
+    if detail:
+        return CE, calibrated_acc, empirical_acc, weights_ece
     return CE
     bins = create_bins(
         n_bins=kwargs["n_bins"], bin_range=kwargs["bin_range"], scheme=kwargs["scheme"], P=p_max
     )
+    CE = CE_estimate(y_correct, p_max, bins=bins, proxy=kwargs["proxy"], detail=kwargs["detail"])
+    if self.detail:
+        return {"ECE": CE[0], "y_bar": CE[1], "p_bar": CE[2], "bin_freq": CE[3], "p_bar_cont": np.mean(p_max,-1), "accuracy": np.mean(y_correct)}
+    return CE
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
         )
     def init_kwargs(
+        self,
+        n_bins: int = 10,
+        bin_range: Optional[int] = [0, 1],
+        scheme: str = "equal-range",
+        proxy: str = "upper-edge",
+        p=1,
+        detail: bool = False,
+        **kwargs,
     ):
         # super(evaluate.EvaluationModule, self).__init__(**kwargs)
         self.n_bins = n_bins
         self.scheme = scheme
         self.proxy = proxy
         self.p = p
+        self.detail = detail
     def _compute(self, predictions, references, **kwargs):
         """Returns the scores"""
         ECE = top_1_CE(references, predictions, **self.__dict__)
+        if self.detail:
+            return ECE
         return {
             "ECE": ECE,
         }
+def test_ECE():
+    """Smoke-test ECE on random multi-class data, with and without detail."""
+    N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
+    K = 5  # K class problem
+    def random_mc_instance(concentration=1, onehot=False):
+        """Draw one (reference, prediction) pair from Dirichlet samples."""
+        alpha = [concentration] * K
+        reference = np.argmax(np.random.dirichlet(alpha), -1)  # class targets
+        prediction = np.random.dirichlet(alpha)  # probabilities
+        if onehot:
+            # `reference` is already a class index; taking argmax of it again
+            # (as before) always selected class 0.
+            reference = np.eye(K)[reference]
+        return reference, prediction
+    references, predictions = zip(*(random_mc_instance() for _ in range(N)))
+    references = np.array(references, dtype=np.int64)
+    predictions = np.array(predictions, dtype=np.float32)
+    res = ECE()._compute(predictions, references)
+    print(f"ECE: {res['ECE']}")
+    res = ECE()._compute(predictions, references, detail=True)
+    # Leftover pdb.set_trace() removed: it blocks every non-interactive run.
+    print(f"ECE: {res['ECE']}")
+if __name__ == '__main__':
+    test_ECE()

tests.py CHANGED Viewed

@@ -1,39 +1,12 @@
 import numpy as np
 test_cases = [
     {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
     },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
 ]
-def test_ECE():
-    N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
-    K = 5  # K class problem
-    def random_mc_instance(concentration=1, onehot=False):
-        reference = np.argmax(
-            np.random.dirichlet(([concentration for _ in range(K)])), -1
-        )  # class targets
-        prediction = np.random.dirichlet(([concentration for _ in range(K)]))  # probabilities
-        if onehot:
-            reference = np.eye(K)[np.argmax(reference, -1)]
-        return reference, prediction
-    references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
-    references = np.array(references, dtype=np.int64)
-    predictions = np.array(predictions, dtype=np.float32)
-    res = ECE()._compute(predictions, references)
-    print(f"ECE: {res['ECE']}")

 import numpy as np
 test_cases = [
+    {"predictions": [[0, 1], [1, 0]], "references": [1, 0], "result": {"ECE": 0}},
+    {"predictions": [[0, 1], [1, 0]], "references": [0, 1], "result": {"ECE": 1}},
     {
+        "predictions": [[0, 0.1, 0.9], [0.2, 0.8, 0]],
+        "references": [2, 0],  # kwargs?
+        # The expected value was written as `>0<1`, which is not valid Python
+        # and made this module unimportable.  Intent: 0 < ECE < 1.
+        # TODO: replace None with the exact value once the binning is pinned down.
+        "result": {"ECE": None},
     },
 ]