Upload metrics.py with huggingface_hub
Browse files- metrics.py +76 -13
metrics.py
CHANGED
|
@@ -16,7 +16,7 @@ from scipy.stats import bootstrap
|
|
| 16 |
from scipy.stats._warnings_errors import DegenerateDataWarning
|
| 17 |
|
| 18 |
from .artifact import Artifact
|
| 19 |
-
from .dataclass import InternalField, OptionalField
|
| 20 |
from .logging_utils import get_logger
|
| 21 |
from .metric_utils import InstanceInput, MetricRequest, MetricResponse
|
| 22 |
from .operator import (
|
|
@@ -58,6 +58,16 @@ def nan_mean(x):
|
|
| 58 |
return np.nanmean(x)
|
| 59 |
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
class UpdateStream(StreamInstanceOperator):
|
| 62 |
update: dict
|
| 63 |
|
|
@@ -69,11 +79,7 @@ class UpdateStream(StreamInstanceOperator):
|
|
| 69 |
|
| 70 |
|
| 71 |
class Metric(Artifact):
|
| 72 |
-
|
| 73 |
-
@abstractmethod
|
| 74 |
-
def main_score(self):
|
| 75 |
-
pass
|
| 76 |
-
|
| 77 |
# Override 'prediction_type' with the expected type of predictions
|
| 78 |
# and references. Example: "List[str]", "List[Dict]"", "string".
|
| 79 |
# If left with default None, a warning will be displayed.
|
|
@@ -229,6 +235,18 @@ class MetricWithConfidenceInterval(Metric):
|
|
| 229 |
[instance["score"]["instance"][score_name] for instance in instances]
|
| 230 |
)
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
@staticmethod
|
| 233 |
def _all_instance_scores_equal(instances, score_name):
|
| 234 |
instance_scores = [
|
|
@@ -625,13 +643,10 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
|
|
| 625 |
# if subgroup_column is not None, a column by the specified name will be required in task_data
|
| 626 |
subgroup_column = None
|
| 627 |
implemented_reductions: List[str] = field(
|
| 628 |
-
default_factory=lambda: ["mean", "group_mean"]
|
| 629 |
)
|
| 630 |
|
| 631 |
-
|
| 632 |
-
@abstractmethod
|
| 633 |
-
def reduction_map(self) -> dict:
|
| 634 |
-
pass
|
| 635 |
|
| 636 |
def _validate_group_mean_reduction(self, instances: List[dict]):
|
| 637 |
"""Ensure that group_mean reduction_map is properly formatted.
|
|
@@ -739,12 +754,19 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
|
|
| 739 |
|
| 740 |
field_name_full_prefix = ""
|
| 741 |
# used for passing to the bootstrapping, depends on whether the groups are fixed or not
|
| 742 |
-
aggregation_function =
|
| 743 |
if reduction_type == "mean":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
reduction_fields = list(set(reduction_params))
|
| 745 |
# no group reduction, so resample instances individually
|
| 746 |
scores_to_resample = instances
|
| 747 |
elif reduction_type == "group_mean":
|
|
|
|
| 748 |
self._validate_group_mean_reduction(instances=instances)
|
| 749 |
reduction_fields = (
|
| 750 |
[self.main_score]
|
|
@@ -941,6 +963,12 @@ class Accuracy(InstanceMetric):
|
|
| 941 |
return result
|
| 942 |
|
| 943 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 944 |
class UnsortedListExactMatch(InstanceMetric):
|
| 945 |
reduction_map = {"mean": ["unsorted_list_exact_match"]}
|
| 946 |
main_score = "unsorted_list_exact_match"
|
|
@@ -988,7 +1016,15 @@ class MetricPipeline(MultiStreamOperator, Metric):
|
|
| 988 |
self.metric.disable_confidence_interval_calculation()
|
| 989 |
|
| 990 |
def verify(self):
|
| 991 |
-
assert
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 992 |
|
| 993 |
def prepare(self):
|
| 994 |
super().prepare()
|
|
@@ -3266,3 +3302,30 @@ class BinaryMaxAccuracy(GlobalMetric):
|
|
| 3266 |
best_thr = thr
|
| 3267 |
|
| 3268 |
return {self.main_score: best_acc, "best_thr_max_acc": best_thr}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from scipy.stats._warnings_errors import DegenerateDataWarning
|
| 17 |
|
| 18 |
from .artifact import Artifact
|
| 19 |
+
from .dataclass import AbstractField, InternalField, OptionalField
|
| 20 |
from .logging_utils import get_logger
|
| 21 |
from .metric_utils import InstanceInput, MetricRequest, MetricResponse
|
| 22 |
from .operator import (
|
|
|
|
| 58 |
return np.nanmean(x)
|
| 59 |
|
| 60 |
|
| 61 |
+
def nan_max(x):
    """Return the maximum of x, ignoring NaN values.

    If ALL values in x are NaN, np.nanmax returns NaN but emits a
    RuntimeWarning about an all-NaN slice; the NaN result is the desired
    behavior here, so that warning is suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        return np.nanmax(x)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
class UpdateStream(StreamInstanceOperator):
|
| 72 |
update: dict
|
| 73 |
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
class Metric(Artifact):
|
| 82 |
+
main_score: str = AbstractField()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
# Override 'prediction_type' with the expected type of predictions
|
| 84 |
# and references. Example: "List[str]", "List[Dict]"", "string".
|
| 85 |
# If left with default None, a warning will be displayed.
|
|
|
|
| 235 |
[instance["score"]["instance"][score_name] for instance in instances]
|
| 236 |
)
|
| 237 |
|
| 238 |
+
@staticmethod
def max_item_scores(instances: List[dict], score_name: str):
    """Calculate max of a set of instance scores (given by score_name), omitting NaN values.

    Args:
        instances: list of dicts of each instance's instance scores.
        score_name: score field name to compute the max for.
    """
    return nan_max(
        [instance["score"]["instance"][score_name] for instance in instances]
    )
|
| 249 |
+
|
| 250 |
@staticmethod
|
| 251 |
def _all_instance_scores_equal(instances, score_name):
|
| 252 |
instance_scores = [
|
|
|
|
| 643 |
# if subgroup_column is not None, a column by the specified name will be required in task_data
|
| 644 |
subgroup_column = None
|
| 645 |
implemented_reductions: List[str] = field(
|
| 646 |
+
default_factory=lambda: ["mean", "group_mean", "max"]
|
| 647 |
)
|
| 648 |
|
| 649 |
+
reduction_map: Dict[str, List[str]] = AbstractField()
|
|
|
|
|
|
|
|
|
|
| 650 |
|
| 651 |
def _validate_group_mean_reduction(self, instances: List[dict]):
|
| 652 |
"""Ensure that group_mean reduction_map is properly formatted.
|
|
|
|
| 754 |
|
| 755 |
field_name_full_prefix = ""
|
| 756 |
# used for passing to the bootstrapping, depends on whether the groups are fixed or not
|
| 757 |
+
aggregation_function = None
|
| 758 |
if reduction_type == "mean":
|
| 759 |
+
aggregation_function = self.average_item_scores
|
| 760 |
+
reduction_fields = list(set(reduction_params))
|
| 761 |
+
# no group reduction, so resample instances individually
|
| 762 |
+
scores_to_resample = instances
|
| 763 |
+
elif reduction_type == "max":
|
| 764 |
+
aggregation_function = self.max_item_scores
|
| 765 |
reduction_fields = list(set(reduction_params))
|
| 766 |
# no group reduction, so resample instances individually
|
| 767 |
scores_to_resample = instances
|
| 768 |
elif reduction_type == "group_mean":
|
| 769 |
+
aggregation_function = self.average_item_scores
|
| 770 |
self._validate_group_mean_reduction(instances=instances)
|
| 771 |
reduction_fields = (
|
| 772 |
[self.main_score]
|
|
|
|
| 963 |
return result
|
| 964 |
|
| 965 |
|
| 966 |
+
class MaxAccuracy(Accuracy):
    """Accuracy variant whose global score is the best (maximum) accuracy achieved by any single instance."""

    reduction_map = {"max": ["accuracy"]}
|
| 970 |
+
|
| 971 |
+
|
| 972 |
class UnsortedListExactMatch(InstanceMetric):
|
| 973 |
reduction_map = {"mean": ["unsorted_list_exact_match"]}
|
| 974 |
main_score = "unsorted_list_exact_match"
|
|
|
|
| 1016 |
self.metric.disable_confidence_interval_calculation()
|
| 1017 |
|
| 1018 |
def verify(self):
    """Validate that this pipeline is fully configured before use.

    Raises:
        AssertionError: if 'metric' or 'main_score' is unset, or if
            'metric' is not a Metric instance.
    """
    assert (
        self.metric is not None
    ), f"'metric' is not set in {self.get_metric_name()}"
    assert (
        self.main_score is not None
    ), f"'main_score' is not set in {self.get_metric_name()}"
    # Report the actual type in the failure message; the original printed the
    # value itself under a "type" label.
    assert isinstance(
        self.metric, Metric
    ), f"'metric' is not set to a Metric class in {self.get_metric_name()} (type: {type(self.metric)})"
|
| 1028 |
|
| 1029 |
def prepare(self):
|
| 1030 |
super().prepare()
|
|
|
|
| 3302 |
best_thr = thr
|
| 3303 |
|
| 3304 |
return {self.main_score: best_acc, "best_thr_max_acc": best_thr}
|
| 3305 |
+
|
| 3306 |
+
|
| 3307 |
+
KO_ERROR_MESSAGE = """
Additional dependencies required. To install them, run:
`pip install "sacrebleu[ko]"`.

For MacOS: If an error about 'mecab-config' shows up during installation, one should run:

`brew install mecab`
`pip install "sacrebleu[ko]"`

"""


class NormalizedSacrebleu(HuggingfaceMetric):
    """SacreBLEU via the HuggingFace 'sacrebleu' metric.

    scale/scaled_fields presumably rescale sacrebleu's 0-100 scores into
    0-1 — confirm against HuggingfaceMetric's handling of these fields.
    """

    hf_metric_name = "sacrebleu"
    hf_main_score = "score"
    prediction_type = "str"
    main_score = "sacrebleu"
    scale = 100.0
    scaled_fields = ["sacrebleu", "precisions"]
    hf_additional_input_fields_pass_one_value = ["tokenize"]
    # Korean tokenization needs extra packages; the message guides installation.
    _requirements_list = {
        "mecab_ko": KO_ERROR_MESSAGE,
        "mecab_ko_dic": KO_ERROR_MESSAGE,
    }
|