from fixed_f1 import FixedF1
from fixed_precision import FixedPrecision
from fixed_recall import FixedRecall

import evaluate
import gradio as gr
import pandas as pd
import numpy as np

title = "'Combine' multiple metrics with this 🤗 Evaluate 🪲 Fix!"

| description = """<p style='text-align: center'> | |
| As I introduce myself to the entirety of the 🤗 ecosystem, I've put together this Space to show off a temporary fix for a current 🪲 in the 🤗 Evaluate library. \n | |
| Check out the original, longstanding issue [here](https://github.com/huggingface/evaluate/issues/234). This details how it is currently impossible to \ | |
| `evaluate.combine()` multiple metrics related to multilabel text classification. Particularly, one cannot `combine` the `f1`, `precision`, and `recall` scores for \ | |
| evaluation. I encountered this issue specifically while training [RoBERTa-base-DReiFT](https://huggingface.co/MarioBarbeque/RoBERTa-base-DReiFT) for multilabel \ | |
| text classification of 805 labeled medical conditions based on drug reviews. The [following workaround](https://github.com/johngrahamreynolds/FixedMetricsForHF) was | |
| created to address this - follow the link to view the source! To see each of these abstracted classes at work independently, view the 🤗 Space I've constructed for each: | |
| [FixedF1📈](https://huggingface.co/spaces/MarioBarbeque/FixedF1), [FixedPrecision🎯](https://huggingface.co/spaces/MarioBarbeque/FixedPrecision), | |
| [FixedRecall📉](https://huggingface.co/spaces/MarioBarbeque/FixedRecall).\n | |
| This Space shows how one can instantiate these custom `evaluate.Metric`s, each with their own unique methodology for averaging across labels, before `combine`-ing them into a | |
| HF `evaluate.CombinedEvaluations` object. From here, we can easily compute each of the metrics simultaneously using `compute`. \n | |
| In general, one writes the following:\n | |
| ```python | |
| f1 = FixedF1(average=...) | |
| precision = FixedPrecision(average=...) | |
| recall = FixedRecall(average=...) | |
| combined = evaluate.combine([f1, precision, recall]) | |
| combined.add_batch(predictions=..., references=...) | |
| combined.compute() | |
| ```\n | |
| where the `average` parameter can be different at instantiation time for each of the metrics. Acceptable values include `[None, 'micro', 'macro', 'weighted']` ( | |
| or `binary` if there exist only two labels). \n | |
| Try it out using the examples below! Then try picking some various averaging methods yourself! | |
| </p> | |
| """ | |
def populate_map(metric_df: pd.DataFrame, metric_set: set) -> dict:
    """Map each requested metric name to the averaging method chosen for it in the metrics table."""
    metric_map = dict()
    for key in metric_set:
        for val in metric_df.loc[metric_df["Metric"] == key]["Averaging Method"]:
            metric_map[key] = val
    return metric_map

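# For the example metrics table wired into the Interface below, populate_map would return
# a mapping like {"f1": "weighted", "precision": "micro", "recall": "macro"} (illustrative
# only; the actual values depend on what the user enters in the Space).
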
def evaluation(predictions_df: pd.DataFrame, metrics_df: pd.DataFrame) -> str:
    """Combine the requested fixed metrics and compute them over the provided predictions."""
    metric_set = set(metrics_df["Metric"].to_list())
    metric_map = populate_map(metrics_df, metric_set)

    # Instantiate only the metrics the user asked for, each with its chosen averaging method
    combined_list = []
    if "f1" in metric_set:
        f1 = FixedF1(average=metric_map["f1"] if metric_map["f1"] != "None" else None)
        combined_list.append(f1)
    if "precision" in metric_set:
        precision = FixedPrecision(average=metric_map["precision"] if metric_map["precision"] != "None" else None, zero_division=np.nan)
        combined_list.append(precision)
    if "recall" in metric_set:
        recall = FixedRecall(average=metric_map["recall"] if metric_map["recall"] != "None" else None)
        combined_list.append(recall)

    combined = evaluate.combine(combined_list)

    predicted = [int(num) for num in predictions_df["Predicted Class Label"].to_list()]
    references = [int(num) for num in predictions_df["Actual Class Label"].to_list()]

    combined.add_batch(predictions=predicted, references=references)
    outputs = combined.compute()

    return f"Your metrics are as follows: \n {outputs}"

space = gr.Interface(
    fn=evaluation,
    inputs=[
        gr.Dataframe(
            headers=["Predicted Class Label", "Actual Class Label"],
            datatype=["number", "number"],
            row_count=5,
            col_count=(2, "fixed"),
            label="Table of Predicted vs Actual Class Labels",
        ),
        gr.Dataframe(
            headers=["Metric", "Averaging Method"],
            datatype=["str", "str"],
            row_count=(3, "fixed"),
            col_count=(2, "fixed"),
            label="Table of Metrics and Averaging Method across Labels",
        ),
    ],
    outputs="text",
    title=title,
    description=description,
    examples=[
        [
            pd.DataFrame(columns=["Predicted Class Label", "Actual Class Label"], data=[[0, 1], [1, 1], [2, 2], [1, 0], [0, 0]]),
            pd.DataFrame(columns=["Metric", "Averaging Method"], data=[["f1", "weighted"], ["precision", "micro"], ["recall", "macro"]]),
        ]
    ],
    cache_examples=False,
).launch()