Spaces:
Build error
Build error
Commit
·
1a07572
1
Parent(s):
e368a57
include new metrics
Browse files — classification_evaluator.py (+34 −15)
classification_evaluator.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import evaluate
|
| 2 |
from datasets import Features, Value
|
| 3 |
-
from sklearn.metrics import accuracy_score
|
|
|
|
| 4 |
|
| 5 |
_CITATION = """
|
| 6 |
@article{scikit-learn,
|
|
@@ -17,13 +18,11 @@ _CITATION = """
|
|
| 17 |
"""
|
| 18 |
|
| 19 |
_DESCRIPTION = """
|
| 20 |
-
|
| 21 |
-
Accuracy
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
FP: False positive
|
| 26 |
-
FN: False negative
|
| 27 |
"""
|
| 28 |
|
| 29 |
_KWARGS_DESCRIPTION = """
|
|
@@ -32,8 +31,12 @@ Args:
|
|
| 32 |
references (`list` of `str`): Ground truth labels.
|
| 33 |
|
| 34 |
Returns:
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"""
|
| 38 |
|
| 39 |
|
|
@@ -50,10 +53,26 @@ class ClassificationEvaluator(evaluate.Metric):
|
|
| 50 |
|
| 51 |
def _compute(self, predictions, references):
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
return {
|
| 54 |
-
"accuracy":
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
)
|
|
|
|
|
|
|
|
|
|
| 59 |
}
|
|
|
|
| 1 |
import evaluate
|
| 2 |
from datasets import Features, Value
|
| 3 |
+
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
|
| 4 |
+
|
| 5 |
|
| 6 |
_CITATION = """
|
| 7 |
@article{scikit-learn,
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
_DESCRIPTION = """
|
| 21 |
+
This evaluator computes multiple classification metrics to assess the performance of a model. Metrics calculated include:
|
| 22 |
+
- Accuracy: The proportion of correct predictions among the total number of cases processed. Computed as (TP + TN) / (TP + TN + FP + FN), where TP, TN, FP, and FN denote true positives, true negatives, false positives, and false negatives respectively.
|
| 23 |
+
- Precision, Recall, and F1-Score: Evaluated for each class individually as well as macro (average across classes) and micro (aggregate contributions of all classes) averages.
|
| 24 |
+
- Confusion Matrix: A matrix representing the classification accuracy for each class combination.
|
| 25 |
+
|
|
|
|
|
|
|
| 26 |
"""
|
| 27 |
|
| 28 |
_KWARGS_DESCRIPTION = """
|
|
|
|
| 31 |
references (`list` of `str`): Ground truth labels.
|
| 32 |
|
| 33 |
Returns:
|
| 34 |
+
Dict containing:
|
| 36 |
+
accuracy (float): Proportion of correct predictions. Value ranges between 0 (worst) and 1 (best).
|
| 37 |
+
precision_macro (float), recall_macro (float), f1_macro (float): Macro averages of precision, recall, and F1-score respectively.
|
| 38 |
+
precision_micro (float), recall_micro (float), f1_micro (float): Micro averages of precision, recall, and F1-score respectively.
|
| 39 |
+
confusion_matrix (list of lists): 2D list representing the confusion matrix of the classification results.
|
| 40 |
"""
|
| 41 |
|
| 42 |
|
|
|
|
| 53 |
|
| 54 |
def _compute(self, predictions, references):
    """Compute classification metrics comparing predictions to references.

    Args:
        predictions: Predicted labels, one per sample.
        references: Ground-truth labels, one per sample.

    Returns:
        dict with:
            accuracy (float): proportion of correct predictions in [0, 1].
            precision_macro / recall_macro / f1_macro (float): unweighted
                per-class averages.
            precision_micro / recall_micro / f1_micro (float): globally
                aggregated averages.
            confusion_matrix (list of lists): row i, column j counts samples
                with true label i predicted as label j.
    """
    accuracy = accuracy_score(references, predictions)

    # Macro averages treat every class equally; micro averages aggregate
    # global TP/FP/FN counts. zero_division=0 keeps the returned values
    # identical while suppressing UndefinedMetricWarning when a class has
    # no predicted (or no true) samples.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        references, predictions, average="macro", zero_division=0
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        references, predictions, average="micro", zero_division=0
    )

    conf_matrix = confusion_matrix(references, predictions)

    # Cast every scalar to a plain Python float (accuracy_score and
    # precision_recall_fscore_support return NumPy scalars) so the result
    # is uniformly JSON-serializable.
    return {
        "accuracy": float(accuracy),
        "precision_macro": float(precision_macro),
        "recall_macro": float(recall_macro),
        "f1_macro": float(f1_macro),
        "precision_micro": float(precision_micro),
        "recall_micro": float(recall_micro),
        "f1_micro": float(f1_micro),
        "confusion_matrix": conf_matrix.tolist(),
    }
|