Spaces: Sleeping
Improve typing, early input checks and format code.
Browse files
- app.py +1 -1
- test_of_time_accuracy.py +29 -15
- tests/test_arithmetic_type_casting.py +1 -0
app.py
CHANGED
|
@@ -3,4 +3,4 @@ from evaluate.utils import launch_gradio_widget
|
|
| 3 |
|
| 4 |
|
| 5 |
module = evaluate.load("aauss/test_of_time_accuracy")
|
| 6 |
-
launch_gradio_widget(module)
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
module = evaluate.load("aauss/test_of_time_accuracy")
|
| 6 |
+
launch_gradio_widget(module)
|
test_of_time_accuracy.py
CHANGED
|
@@ -20,24 +20,25 @@ from typing import Any, Literal, TypedDict
|
|
| 20 |
import datasets
|
| 21 |
import evaluate
|
| 22 |
|
| 23 |
-
# Field names used throughout the metric
|
| 24 |
FIELD_EXPLANATION = "explanation"
|
| 25 |
FIELD_ANSWER = "answer"
|
| 26 |
FIELD_AGE = "age"
|
| 27 |
FIELD_ORDERED_LIST = "ordered_list"
|
| 28 |
FIELD_UNORDERED_LIST = "unordered_list"
|
| 29 |
|
| 30 |
-
# Subset names
|
| 31 |
SUBSET_ARITHMETIC = "arithmetic"
|
| 32 |
SUBSET_SEMANTIC = "semantic"
|
| 33 |
VALID_SUBSETS = frozenset({SUBSET_ARITHMETIC, SUBSET_SEMANTIC})
|
| 34 |
|
|
|
|
|
|
|
| 35 |
# Control character escape mappings for JSON string normalization
|
| 36 |
-
CONTROL_CHAR_ESCAPES = {
|
| 37 |
|
| 38 |
|
| 39 |
class AccuracyResult(TypedDict):
|
| 40 |
-
accuracy: float | list[bool]
|
|
|
|
| 41 |
|
| 42 |
_CITATION = """\
|
| 43 |
@InProceedings{huggingface:module,
|
|
@@ -58,9 +59,9 @@ Args:
|
|
| 58 |
predictions: list of predictions to score. Each prediction should be a string that contains a JSON object (e.g., generated by an LLM).
|
| 59 |
references: list of reference answers.
|
| 60 |
subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic".
|
| 61 |
-
return_average: If True, returns the average accuracy. If False, returns a list of
|
| 62 |
Returns:
|
| 63 |
-
accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of
|
| 64 |
Examples:
|
| 65 |
>>> import evaluate
|
| 66 |
>>> metric = evaluate.load("aauss/test_of_time_accuracy")
|
|
@@ -122,7 +123,7 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 122 |
decoder = json.JSONDecoder()
|
| 123 |
idx = 0
|
| 124 |
while idx < len(text):
|
| 125 |
-
if text[idx] == '{':
|
| 126 |
try:
|
| 127 |
obj, _ = decoder.raw_decode(text, idx)
|
| 128 |
if isinstance(obj, dict):
|
|
@@ -145,7 +146,7 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 145 |
i = 0
|
| 146 |
while i < len(text):
|
| 147 |
char = text[i]
|
| 148 |
-
if char == '\\' and in_string and i + 1 < len(text):
|
| 149 |
# Preserve existing escape sequences
|
| 150 |
result.append(char)
|
| 151 |
result.append(text[i + 1])
|
|
@@ -158,7 +159,7 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 158 |
else:
|
| 159 |
result.append(char)
|
| 160 |
i += 1
|
| 161 |
-
return ''.join(result)
|
| 162 |
|
| 163 |
@staticmethod
|
| 164 |
def _parse_reference_label(label_str: str) -> dict | None:
|
|
@@ -297,7 +298,9 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 297 |
# Process list fields regardless of key order
|
| 298 |
for key in (FIELD_ORDERED_LIST, FIELD_UNORDERED_LIST):
|
| 299 |
if key in data and isinstance(data[key], list):
|
| 300 |
-
data[key] = [item.lower() for item in data[key] if isinstance(item, str)]
|
|
|
|
|
|
|
| 301 |
|
| 302 |
return data
|
| 303 |
|
|
@@ -413,7 +416,7 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 413 |
# Semantic references are used as-is
|
| 414 |
return raw_references
|
| 415 |
|
| 416 |
-
def _compare_pair(self, prediction: Any, reference: Any, subset: str) -> bool:
|
| 417 |
"""
|
| 418 |
Compares a single prediction-reference pair.
|
| 419 |
|
|
@@ -423,7 +426,7 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 423 |
subset: Either 'arithmetic' or 'semantic'
|
| 424 |
|
| 425 |
Returns:
|
| 426 |
-
|
| 427 |
"""
|
| 428 |
if subset == SUBSET_ARITHMETIC:
|
| 429 |
prediction, reference = self._process_arithmetic_prediction(
|
|
@@ -434,13 +437,13 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 434 |
prediction, reference
|
| 435 |
)
|
| 436 |
|
| 437 |
-
return prediction == reference
|
| 438 |
|
| 439 |
def _compute(
|
| 440 |
self,
|
| 441 |
predictions: list[str],
|
| 442 |
references: list[str],
|
| 443 |
-
subset: str,
|
| 444 |
return_average: bool = True,
|
| 445 |
) -> AccuracyResult:
|
| 446 |
"""
|
|
@@ -456,11 +459,22 @@ class TestOfTimeAccuracy(evaluate.Metric):
|
|
| 456 |
Returns:
|
| 457 |
Dictionary with 'accuracy' key containing either:
|
| 458 |
- float: average accuracy (if return_average=True)
|
| 459 |
-
- list[
|
| 460 |
|
| 461 |
Raises:
|
| 462 |
ValueError: If subset is not 'arithmetic' or 'semantic'
|
|
|
|
|
|
|
| 463 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
# Validate subset
|
| 465 |
if subset not in VALID_SUBSETS:
|
| 466 |
raise ValueError(
|
|
|
|
| 20 |
import datasets
|
| 21 |
import evaluate
|
| 22 |
|
|
|
|
| 23 |
FIELD_EXPLANATION = "explanation"
|
| 24 |
FIELD_ANSWER = "answer"
|
| 25 |
FIELD_AGE = "age"
|
| 26 |
FIELD_ORDERED_LIST = "ordered_list"
|
| 27 |
FIELD_UNORDERED_LIST = "unordered_list"
|
| 28 |
|
|
|
|
| 29 |
SUBSET_ARITHMETIC = "arithmetic"
|
| 30 |
SUBSET_SEMANTIC = "semantic"
|
| 31 |
VALID_SUBSETS = frozenset({SUBSET_ARITHMETIC, SUBSET_SEMANTIC})
|
| 32 |
|
| 33 |
+
SubsetType = Literal["arithmetic", "semantic"]
|
| 34 |
+
|
| 35 |
# Control character escape mappings for JSON string normalization
|
| 36 |
+
CONTROL_CHAR_ESCAPES = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}
|
| 37 |
|
| 38 |
|
| 39 |
class AccuracyResult(TypedDict):
|
| 40 |
+
accuracy: float | list[int]
|
| 41 |
+
|
| 42 |
|
| 43 |
_CITATION = """\
|
| 44 |
@InProceedings{huggingface:module,
|
|
|
|
| 59 |
predictions: list of predictions to score. Each prediction should be a string that contains a JSON object (e.g., generated by an LLM).
|
| 60 |
references: list of reference answers.
|
| 61 |
subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic".
|
| 62 |
+
return_average: If True, returns the average accuracy. If False, returns a list of int scores (0 or 1) for each sample. Defaults to True.
|
| 63 |
Returns:
|
| 64 |
+
accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of int (0 or 1) indicating correctness per sample if return_average=False.
|
| 65 |
Examples:
|
| 66 |
>>> import evaluate
|
| 67 |
>>> metric = evaluate.load("aauss/test_of_time_accuracy")
|
|
|
|
| 123 |
decoder = json.JSONDecoder()
|
| 124 |
idx = 0
|
| 125 |
while idx < len(text):
|
| 126 |
+
if text[idx] == "{":
|
| 127 |
try:
|
| 128 |
obj, _ = decoder.raw_decode(text, idx)
|
| 129 |
if isinstance(obj, dict):
|
|
|
|
| 146 |
i = 0
|
| 147 |
while i < len(text):
|
| 148 |
char = text[i]
|
| 149 |
+
if char == "\\" and in_string and i + 1 < len(text):
|
| 150 |
# Preserve existing escape sequences
|
| 151 |
result.append(char)
|
| 152 |
result.append(text[i + 1])
|
|
|
|
| 159 |
else:
|
| 160 |
result.append(char)
|
| 161 |
i += 1
|
| 162 |
+
return "".join(result)
|
| 163 |
|
| 164 |
@staticmethod
|
| 165 |
def _parse_reference_label(label_str: str) -> dict | None:
|
|
|
|
| 298 |
# Process list fields regardless of key order
|
| 299 |
for key in (FIELD_ORDERED_LIST, FIELD_UNORDERED_LIST):
|
| 300 |
if key in data and isinstance(data[key], list):
|
| 301 |
+
data[key] = [
|
| 302 |
+
item.lower() for item in data[key] if isinstance(item, str)
|
| 303 |
+
]
|
| 304 |
|
| 305 |
return data
|
| 306 |
|
|
|
|
| 416 |
# Semantic references are used as-is
|
| 417 |
return raw_references
|
| 418 |
|
| 419 |
+
def _compare_pair(self, prediction: Any, reference: Any, subset: str) -> int:
|
| 420 |
"""
|
| 421 |
Compares a single prediction-reference pair.
|
| 422 |
|
|
|
|
| 426 |
subset: Either 'arithmetic' or 'semantic'
|
| 427 |
|
| 428 |
Returns:
|
| 429 |
+
1 if prediction matches reference, 0 otherwise
|
| 430 |
"""
|
| 431 |
if subset == SUBSET_ARITHMETIC:
|
| 432 |
prediction, reference = self._process_arithmetic_prediction(
|
|
|
|
| 437 |
prediction, reference
|
| 438 |
)
|
| 439 |
|
| 440 |
+
return int(prediction == reference)
|
| 441 |
|
| 442 |
def _compute(
|
| 443 |
self,
|
| 444 |
predictions: list[str],
|
| 445 |
references: list[str],
|
| 446 |
+
subset: SubsetType,
|
| 447 |
return_average: bool = True,
|
| 448 |
) -> AccuracyResult:
|
| 449 |
"""
|
|
|
|
| 459 |
Returns:
|
| 460 |
Dictionary with 'accuracy' key containing either:
|
| 461 |
- float: average accuracy (if return_average=True)
|
| 462 |
+
- list[int]: per-sample correctness (if return_average=False)
|
| 463 |
|
| 464 |
Raises:
|
| 465 |
ValueError: If subset is not 'arithmetic' or 'semantic'
|
| 466 |
+
ValueError: If predictions is empty
|
| 467 |
+
ValueError: If predictions and references have different lengths
|
| 468 |
"""
|
| 469 |
+
# Validate inputs
|
| 470 |
+
if not predictions:
|
| 471 |
+
raise ValueError("predictions cannot be empty")
|
| 472 |
+
if len(predictions) != len(references):
|
| 473 |
+
raise ValueError(
|
| 474 |
+
f"predictions and references must have same length, "
|
| 475 |
+
f"got {len(predictions)} and {len(references)}"
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
# Validate subset
|
| 479 |
if subset not in VALID_SUBSETS:
|
| 480 |
raise ValueError(
|
tests/test_arithmetic_type_casting.py
CHANGED
|
@@ -86,6 +86,7 @@ def test_ordered_list_type_casting():
|
|
| 86 |
pred_cast = TestOfTimeAccuracy._cast_prediction_to_reference_types(ref, pred)
|
| 87 |
assert ref == pred_cast
|
| 88 |
|
|
|
|
| 89 |
# TODO: Check if I should treat float strings differently, e.g., int(float("18.0"))
|
| 90 |
def test_abc_type_casting():
|
| 91 |
references_abc_keys = [
|
|
|
|
| 86 |
pred_cast = TestOfTimeAccuracy._cast_prediction_to_reference_types(ref, pred)
|
| 87 |
assert ref == pred_cast
|
| 88 |
|
| 89 |
+
|
| 90 |
# TODO: Check if I should treat float strings differently, e.g., int(float("18.0"))
|
| 91 |
def test_abc_type_casting():
|
| 92 |
references_abc_keys = [
|