from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

import evaluate
import gradio as gr
import pandas as pd
import numpy as np

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

| description = """<p style='text-align: center'> | |
| As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n | |
| Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \ | |
| `evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \ | |
| evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \ | |
| text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was | |
| created to address this - follow the link to view the source! To see each of these abstracted classes at work independently, view the 🤗 Space I've constructed for each: | |
| [FixedF1📈](https://huggingface.co/spaces/MarioBarbeque/FixedF1), [FixedPrecision🎯](https://huggingface.co/spaces/MarioBarbeque/FixedPrecision), | |
| [FixedRecall📉](https://huggingface.co/spaces/MarioBarbeque/FixedRecall).\n | |
| This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a | |
| HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`. \n | |
| In general, one writes the following:\n | |
| ```python | |
| f1 = FixedF1(average=...) | |
| precision = FixedPrecision(average=...) | |
| recall = FixedRecall(average=...) | |
| combined = evaluate.combine([f1, precision, recall]) | |
| combined.add_batch(predictions=..., references=...) | |
| combined.compute() | |
| ```\n | |
| where the `average` parameter can be different at instantiation time for each of the metrics. Acceptable values include `[None, 'micro', 'macro', 'weighted']` ( | |
| or `binary` if there exist only two labels). \n | |
| Try it out using the examples below! Then try picking some various averaging methods yourself! | |
| </p> | |
| """ | |
def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    """Map each requested metric name to the averaging method chosen for it in the metrics table."""
    metric_map = dict()
    for key in metric_set:
        for val in metric_df.loc[metric_df["Metric"] == key]["Averaging Method"]:
            metric_map[key] = val
    return metric_map

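# For the example metrics table wired into the Interface below, populate_map would return
# a mapping like {"f1": "weighted", "precision": "micro", "recall": "macro"} (illustrative
# only; the actual values depend on what the user enters in the Space).
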
def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
    """Combine the requested fixed metrics and compute them over the provided predictions."""
    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)

    # Instantiate only the metrics the user asked for, each with its chosen averaging method
    combined_list = []
    if "f1" in metric_set:
        f1 = FixedF1(average=metric_map["f1"] if metric_map["f1"] != "None" else None)
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metric_map["precision"] if metric_map["precision"] != "None" else None, zero_division=np.nan)
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metric_map["recall"] if metric_map["recall"] != "None" else None)
        combined_list.append(recall)

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"

space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels",
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Method"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels",
        ),
    ],
    outputs="text",
    title=title,
    description=description,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]]),
            pd.DataFrame(columns=["Metric", "Averaging Method"], data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]]),
        ]
    ],
    cache_examples=False,
).launch()