Martin Dočekal committed on
Commit · d0c77e3
1 Parent(s): 8d943be
initial commit
Browse files
- .gitattributes +0 -35
- README.md +44 -3
- app.py +6 -0
- precision_recall_fscore_accuracy.py +131 -0
- requirements.txt +3 -0
- tests.py +97 -0
.gitattributes
DELETED
@@ -1,35 +0,0 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,12 +1,53 @@
 ---
 title: Precision Recall Fscore Accuracy
+tags:
+- evaluate
+- metric
+colorFrom: gray
 colorTo: green
+description: >-
+  This metric calculates precision, recall, accuracy, and fscore for classification tasks using scikit-learn.
 sdk: gradio
 sdk_version: 5.23.1
 app_file: app.py
 pinned: false
+datasets: []
 ---

+# Metric Card for Precision Recall Accuracy Fscore
+This metric calculates precision, recall, accuracy, and fscore for classification tasks using scikit-learn.
+
+## How to Use
+
+>>> predictions = [0, 1, 0, 1]
+>>> references = [1, 1, 0, 0]
+>>> metric = evaluate.load("precision_recall_fscore_accuracy", average="binary")
+>>> metric.compute(predictions=predictions, references=references)
+{'precision': 0.5, 'recall': 0.5, 'fscore': 0.5, 'accuracy': 0.5}
+
+## Inputs
+- **predictions** (List of int|str): List of predicted labels.
+- **references** (List of int|str): List of true labels.
+
+## Outputs
+Dictionary containing the following metrics:
+- **precision** (float): Precision score.
+- **recall** (float): Recall score.
+- **fscore** (float): F1 score.
+- **accuracy** (float): Accuracy score.
+
+## Citation
+```bibtex
+@article{scikit-learn,
+  title={Scikit-learn: Machine Learning in {P}ython},
+  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+          and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+          and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+          Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+  journal={Journal of Machine Learning Research},
+  volume={12},
+  pages={2825--2830},
+  year={2011}
+}
+
+```
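For reference, the README example can be reproduced with scikit-learn directly, which is what this metric wraps. A minimal sketch (plain sklearn, no evaluate involved):

```python
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

predictions = [0, 1, 0, 1]
references = [1, 1, 0, 0]

# average="binary" scores only the positive class (label 1):
# one true positive, one false positive, one false negative here.
precision, recall, fscore, _ = precision_recall_fscore_support(
    references, predictions, average="binary"
)
accuracy = accuracy_score(references, predictions)
print(precision, recall, fscore, accuracy)  # 0.5 0.5 0.5 0.5
```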
app.py
ADDED
@@ -0,0 +1,6 @@
import evaluate
from evaluate.utils import launch_gradio_widget


module = evaluate.load("mdocekal/precision_recall_accuracy_fscore")
launch_gradio_widget(module)
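app.py only wraps the module in the standard evaluate Gradio widget. A minimal sketch of using the same module programmatically instead, assuming the Hub id from app.py resolves (a local path to precision_recall_fscore_accuracy.py would also work with evaluate.load):

```python
import evaluate

# Extra keyword arguments such as `average` are forwarded by evaluate.load
# to the metric's __init__, as in the README example.
module = evaluate.load("mdocekal/precision_recall_accuracy_fscore", average="binary")

result = module.compute(predictions=[0, 1, 0, 1], references=[1, 1, 0, 0])
print(result)  # all four scores should come out as 0.5 for this example
```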
precision_recall_fscore_accuracy.py
ADDED
@@ -0,0 +1,131 @@
from collections import Counter
from typing import Optional, Union

import evaluate
import datasets

from sklearn.metrics import precision_recall_fscore_support, accuracy_score


_CITATION = """
@article{scikit-learn,
  title={Scikit-learn: Machine Learning in {P}ython},
  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
          and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
          and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
          Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
  journal={Journal of Machine Learning Research},
  volume={12},
  pages={2825--2830},
  year={2011}
}
"""

_DESCRIPTION = """\
This metric calculates precision, recall, accuracy, and fscore for classification tasks using scikit-learn.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions: list or numpy array of predicted labels.
    references: list or numpy array of true labels.
    average (str, optional): Type of averaging performed on the data.
        This parameter is required for multiclass/multilabel targets.
        If ``None``, the metrics for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).
    zero_division (int or str, optional): default="warn"
        Sets the value to return when there is a zero division:

        - recall: when there are no positive labels
        - precision: when there are no positive predictions
        - f-score: both

        Notes:

        - If set to "warn", this acts like 0, but a warning is also raised.
        - If set to `np.nan`, such values will be excluded from the average.

        .. versionadded:: 1.3
           `np.nan` option was added.

Returns:
    A dictionary with the following keys:
    - precision: Precision score.
    - recall: Recall score.
    - fscore: F1 score.
    - accuracy: Accuracy score.

Examples:
    >>> predictions = [0, 1, 0, 1]
    >>> references = [1, 1, 0, 0]
    >>> metric = evaluate.load("precision_recall_fscore_accuracy", average="binary")
    >>> metric.compute(predictions=predictions, references=references)
    {'precision': 0.5, 'recall': 0.5, 'fscore': 0.5, 'accuracy': 0.5}

"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class PrecisionRecallFscoreAccuracy(evaluate.Metric):
    """
    Wrapper around scikit-learn's precision_recall_fscore_support and accuracy_score for classification tasks.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = kwargs.get("beta", 1.0)
        self.average = kwargs.get("average", None)
        self.zero_division = kwargs.get("zero_division", "warn")

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=[
                datasets.Features({
                    'predictions': datasets.Value('int64'),
                    'references': datasets.Value('int64'),
                }),
                datasets.Features({
                    'predictions': datasets.Value('string'),
                    'references': datasets.Value('string'),
                }),
            ]
        )

    def _compute(self, predictions: list[Union[int, str]], references: list[Union[int, str]]):
        precision, recall, f1, _ = precision_recall_fscore_support(
            references, predictions, average=self.average, zero_division=self.zero_division
        )
        accuracy = accuracy_score(references, predictions)
        return {
            "precision": precision,
            "recall": recall,
            "fscore": f1,
            "accuracy": accuracy,
        }
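The `average` and `zero_division` arguments are forwarded unchanged to scikit-learn, so their effect can be previewed with precision_recall_fscore_support alone; with `average=None` the module returns per-class numpy arrays rather than single floats. A small sketch with made-up multiclass labels:

```python
from sklearn.metrics import precision_recall_fscore_support

references = [0, 1, 2, 2]
predictions = [0, 2, 2, 2]

# average=None: per-class arrays, one entry per label (0, 1, 2).
print(precision_recall_fscore_support(references, predictions, average=None, zero_division=0))

# average="macro": unweighted mean over the per-class scores.
print(precision_recall_fscore_support(references, predictions, average="macro", zero_division=0))

# average="micro": computed from global TP/FP/FN counts; for single-label
# classification the micro precision/recall/F1 all equal the accuracy.
print(precision_recall_fscore_support(references, predictions, average="micro", zero_division=0))
```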
requirements.txt
ADDED
@@ -0,0 +1,3 @@
evaluate
datasets
scikit-learn
tests.py
ADDED
@@ -0,0 +1,97 @@
from unittest import TestCase

from precision_recall_fscore_accuracy import PrecisionRecallFscoreAccuracy


class PrecisionRecallFscoreAccuracyTestBinary(TestCase):
    """
    All of these tests are also used for the multiset configuration. So please mind this and write each test in a way
    that is valid for both configurations (do not use the same label multiple times).
    """

    def setUp(self):
        self.metric = PrecisionRecallFscoreAccuracy(average="binary")

    def test_eok(self):
        self.assertDictEqual(
            {
                "precision": 1.0,
                "recall": 1.0,
                "fscore": 1.0,
                "accuracy": 1.0,
            },
            self.metric.compute(
                predictions=[0, 1, 0],
                references=[0, 1, 0]
            )
        )

    def test_eok_string(self):
        self.assertDictEqual(
            {
                "precision": 1.0,
                "recall": 1.0,
                "accuracy": 1.0,
                "fscore": 1.0
            },
            self.metric.compute(
                predictions=["0", "1", "0"],
                references=["0", "1", "0"]
            )
        )

    def test_completely_different(self):
        self.assertDictEqual(
            {
                "precision": 0.0,
                "recall": 0.0,
                "accuracy": 0.0,
                "fscore": 0.0
            },
            self.metric.compute(
                predictions=[0, 1, 0],
                references=[1, 0, 1]
            )
        )

    def test_max_precision(self):
        self.assertDictEqual(
            {
                "precision": 1.0,
                "recall": 0.5,
                "accuracy": 0.5,
                "fscore": 2 / 3
            },
            self.metric.compute(
                predictions=[0, 1],
                references=[1, 1]
            )
        )

    def test_max_recall(self):
        self.assertDictEqual(
            {
                "precision": 0.5,
                "recall": 1.0,
                "accuracy": 0.5,
                "fscore": 2 / 3
            },
            self.metric.compute(
                predictions=[1, 1],
                references=[1, 0]
            )
        )

    def test_partial_match(self):
        self.assertDictEqual(
            {
                "precision": 0.5,
                "recall": 0.5,
                "accuracy": 0.5,
                "fscore": 0.5
            },
            self.metric.compute(
                predictions=[0, 1, 0, 1],
                references=[1, 1, 0, 0]
            )
        )
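As a sanity check on the expected numbers, take test_max_precision: predictions=[0, 1] against references=[1, 1] gives one true positive, no false positives, and one false negative, so precision = 1/1 = 1.0, recall = 1/2 = 0.5, F1 = 2 * (1.0 * 0.5) / (1.0 + 0.5) = 2/3, and accuracy = 1/2 = 0.5, matching the asserted dictionary. The same values can be confirmed with scikit-learn directly:

```python
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# References first, predictions second, matching the test_max_precision case.
p, r, f, _ = precision_recall_fscore_support([1, 1], [0, 1], average="binary")
print(p, r, f, accuracy_score([1, 1], [0, 1]))  # 1.0 0.5 0.6666666666666666 0.5
```

With the metric file on the import path, the suite should run under the standard library runner, e.g. `python -m unittest tests`.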