Add early input check, improve type hints and format code.
- app.py +1 -1
- tram_accuracy.py +24 -8
app.py
CHANGED
@@ -3,4 +3,4 @@ from evaluate.utils import launch_gradio_widget
 
 
 module = evaluate.load("aauss/tram_accuracy")
-launch_gradio_widget(module)
+launch_gradio_widget(module)
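The app.py hunk removes and re-adds an identical line, so the change is whitespace-only (most likely the trailing newline), matching the "format code" part of the commit message. For context, the loaded metric can also be exercised outside the Gradio widget; a minimal sketch, assuming references are bare choice letters (the prediction string is hypothetical):

import evaluate

module = evaluate.load("aauss/tram_accuracy")
# Hypothetical model output containing the final-answer phrase the metric parses.
result = module.compute(
    predictions=["Let us reason step by step. The final answer is (B)."],
    references=["B"],
)
print(result)  # expected: {"accuracy": 1.0}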
tram_accuracy.py
CHANGED
@@ -14,8 +14,18 @@
 """Metric to calculate the accuracy for the TRAM benchmark by Wang et al. (2024)."""
 
 import re
-import evaluate
+from typing import TypedDict
+
 import datasets
+import evaluate
+
+VALID_ANSWER_CHOICES = frozenset({"A", "B", "C", "D"})
+
+TRAM_ANSWER_PATTERN = r"[Tt]he final answer is \(([A-D])\)"
+
+
+class AccuracyResult(TypedDict):
+    accuracy: float | list[int]
 
 
 _CITATION = """\
@@ -44,14 +54,14 @@ Returns:
 """
 
 
-TRAM_ANSWER_REGEX = re.compile(r"[Tt]he final answer is \(([A-D])\)")
+TRAM_ANSWER_REGEX = re.compile(TRAM_ANSWER_PATTERN)
 
 
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class TRAMAccuracy(evaluate.Metric):
     """Calculates the accuracy for the (multiple choice) TRAM datasets by extracting the final answer from the prediction and comparing it to the reference answer."""
 
-    def _info(self):
+    def _info(self) -> evaluate.MetricInfo:
         return evaluate.MetricInfo(
             module_type="metric",
             description=_DESCRIPTION,
@@ -65,13 +75,20 @@ class TRAMAccuracy(evaluate.Metric):
                 }
             ),
             homepage="https://huggingface.co/spaces/aauss/tram_accuracy",
-            codebase_urls=["https://huggingface.co/spaces/aauss/tram_accuracy/tree/main"],
+            codebase_urls=[
+                "https://huggingface.co/spaces/aauss/tram_accuracy/tree/main"
+            ],
             reference_urls=["https://huggingface.co/datasets/Warrieryes/TRAM-Temporal"],
         )
 
-    def _compute(self, predictions, references, return_average=True):
+    def _compute(
+        self,
+        predictions: list[str],
+        references: list[str],
+        return_average: bool = True,
+    ) -> AccuracyResult:
         """Returns the accuracy for the (multiple choice) TRAM datasets."""
-        if
+        if not predictions:
             raise ValueError("predictions cannot be empty")
         if len(predictions) != len(references):
             raise ValueError(
@@ -91,5 +108,4 @@ class TRAMAccuracy(evaluate.Metric):
         ]
         if return_average:
             return {"accuracy": sum(accuracy) / len(accuracy)}
-
-        return {"accuracy": accuracy}
+        return {"accuracy": accuracy}
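The regex now lives in the TRAM_ANSWER_PATTERN constant and captures the choice letter from a model's final-answer phrase; a quick illustration of what group 1 yields (the input string is hypothetical):

import re

TRAM_ANSWER_PATTERN = r"[Tt]he final answer is \(([A-D])\)"
TRAM_ANSWER_REGEX = re.compile(TRAM_ANSWER_PATTERN)

# Group 1 captures only the letter inside the parentheses; "[Tt]he" accepts either case.
match = TRAM_ANSWER_REGEX.search("Step-by-step reasoning... the final answer is (C).")
print(match.group(1) if match else None)  # C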
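The return_average flag is what the AccuracyResult TypedDict encodes: _compute returns either the mean accuracy (float) or the per-example 0/1 list (list[int]). A sketch of that aggregation (an assumption about the elided comparison step, not the module's exact code):

# Letters TRAM_ANSWER_REGEX would capture from two hypothetical predictions.
extracted = ["A", "B"]
references = ["A", "C"]
# Per-example correctness as 0/1, mirroring the metric's two return modes.
accuracy = [int(p == r) for p, r in zip(extracted, references)]
print(accuracy)                       # [1, 0]
print(sum(accuracy) / len(accuracy))  # 0.5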
|