Spaces:

aauss
/

tram_accuracy

Sleeping

App Files Files Community

aauss committed on Jan 15

Commit

e6bd448

1 Parent(s): dbb13b7

Improve regex, add tests, and fail early with wrong input lengths.

Browse files

Files changed (4) hide show

.gitignore +2 -0
tests.py +0 -17
tests/test_metric.py +53 -0
tram_accuracy.py +9 -2

.gitignore CHANGED Viewed

@@ -8,3 +8,5 @@ wheels/
 # Virtual environments
 .venv

 # Virtual environments
 .venv
+.DS_Store

tests.py DELETED Viewed

@@ -1,17 +0,0 @@
-test_cases = [
-    {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
-    },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
-]

tests/test_metric.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from tram_accuracy import TRAMAccuracy
@@ -16,3 +17,55 @@ def test_tram_accuracy():
         "accuracy"
     ]
     assert accuracy == 1.0

+import pytest
 from tram_accuracy import TRAMAccuracy
         "accuracy"
     ]
     assert accuracy == 1.0
+def test_empty_predictions():
+    """Empty predictions should raise ValueError."""
+    with pytest.raises(ValueError, match="predictions cannot be empty"):
+        TRAMAccuracy()._compute(predictions=[], references=[])
+def test_mismatched_lengths():
+    """Mismatched lengths should raise ValueError."""
+    with pytest.raises(ValueError, match="must have same length"):
+        TRAMAccuracy()._compute(
+            predictions=["The final answer is (A)."],
+            references=["A", "B"],
+        )
+def test_no_regex_match():
+    """Predictions without the expected format should be marked incorrect."""
+    result = TRAMAccuracy()._compute(
+        predictions=["I think the answer is A", "The final answer is (B)."],
+        references=["A", "B"],
+        return_average=False,
+    )
+    assert result["accuracy"] == [0, 1]
+def test_partial_accuracy():
+    """Test partial accuracy calculation."""
+    result = TRAMAccuracy()._compute(
+        predictions=[
+            "The final answer is (A).",
+            "The final answer is (B).",
+            "The final answer is (C).",
+        ],
+        references=["A", "C", "C"],
+        return_average=True,
+    )
+    assert result["accuracy"] == pytest.approx(2 / 3)
+def test_case_variations():
+    """Both 'The' and 'the' should be matched."""
+    result = TRAMAccuracy()._compute(
+        predictions=[
+            "The final answer is (A).",
+            "the final answer is (B).",
+        ],
+        references=["A", "B"],
+        return_average=False,
+    )
+    assert result["accuracy"] == [1, 1]

tram_accuracy.py CHANGED Viewed

@@ -37,14 +37,14 @@ Args:
     predictions: list of predictions to score. Each prediction
         should be a string with the model's response, which contains the final answer.
     references: list of reference for each prediction. Each
-        reference a single letter respresenting the correct answer.
     return_average: whether to return the average accuracy or the accuracy for each prediction.
 Returns:
     accuracy: the accuracy for the TRAM datasets.
 """
-TRAM_ANSWER_REGEX = re.compile(r"[Tt]he final answer is .([A-D]).")
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -75,6 +75,13 @@ class TRAMAccuracy(evaluate.Metric):
     def _compute(self, predictions, references, return_average=True):
         """Returns the accuracy for the (multiple choice) TRAM datasets."""
         predictions_matches = [
             TRAM_ANSWER_REGEX.search(prediction) for prediction in predictions
         ]

     predictions: list of predictions to score. Each prediction
         should be a string with the model's response, which contains the final answer.
     references: list of reference for each prediction. Each
+        reference a single letter representing the correct answer.
     return_average: whether to return the average accuracy or the accuracy for each prediction.
 Returns:
     accuracy: the accuracy for the TRAM datasets.
 """
+TRAM_ANSWER_REGEX = re.compile(r"[Tt]he final answer is \(([A-D])\)")
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
     def _compute(self, predictions, references, return_average=True):
         """Returns the accuracy for the (multiple choice) TRAM datasets."""
+        if len(predictions) == 0:
+            raise ValueError("predictions cannot be empty")
+        if len(predictions) != len(references):
+            raise ValueError(
+                f"predictions and references must have same length, "
+                f"got {len(predictions)} and {len(references)}"
+            )
         predictions_matches = [
             TRAM_ANSWER_REGEX.search(prediction) for prediction in predictions
         ]