Spaces:

aauss
/

timebench_eval

Sleeping

App Files Files Community

aauss committed on Jan 13

Commit

fb49a5d

1 Parent(s): 6bb843b

Implement TimeDial evaluation.

Browse files

Files changed (6) hide show

tests/conftest.py +45 -0
tests/test_answer_extraction.py +8 -1
tests/test_metrics.py +34 -1
timebench_eval/.gitattributes +0 -35
timebench_eval/timebench_eval.py +139 -21
uv.lock +0 -0

tests/conftest.py CHANGED Viewed

@@ -107,3 +107,48 @@ PREDICTION_4 = dedent("""\
     Since the only team he left in 2002 is Cardiff City, and the timeframe in question is 1998–2000, it is the most likely candidate.
     Thus, the correct answer is: Cardiff City.""")

     Since the only team he left in 2002 is Cardiff City, and the timeframe in question is 1998–2000, it is the most likely candidate.
     Thus, the correct answer is: Cardiff City.""")
+PREDICTION_5 = dedent("""\
+    Let's analyze the dialogue step by step to determine what makes the most sense in the context of the <mask>.
+    Dialogue:
+    Person1: What did you say?
+    Person2: I said it's a lovely day. Why don't we go for a walk?
+    Person1: Well, I feel a little tired.
+    Person2: Come on! A little labor, much health.
+    Person1: Then can you wait a few minutes? I want to finish writing this letter.
+    Person2: Don't take too long. It would be a shame not to take advantage of such lovely weather.
+    Person1: I won't be long. <MASK>. Why don't you go ahead and I'll meet you in the park?
+    Person2: I believe I will. Look for me near the lake.
+    We are to choose appropriate options to substitute the <mask>.
+    Now, evaluate the options:
+    A. No more than ten months
+    B. No more than ten minutes
+    C. No more than five minutes
+    D. No more than two years
+    Contextual Clue:
+    Person1 says "I won't be long" — implying a short time.
+    They are still writing a letter, and the second person is suggesting they go for a walk now.
+    The weather is lovely, and the second person is urging them not to delay.
+    So, the time frame must be very short — plausible in the context of finishing a letter.
+    Option A: "No more than ten months" — that's a long time. Doesn’t align with "I won't be long."
+    Option B: "No more than ten minutes" — reasonable, short time, fits with "won't be long."
+    Option C: "No more than five minutes" — even shorter, very plausible and fits better with "won't be long."
+    Option D: "No more than two years" — extremely long — totally inconsistent with the context.
+    So, B and C are both reasonable and within the context.
+    Both are short durations and reasonable for finishing a letter.
+    Note: The sentence says: "I won't be long. <MASK>. Why don't you go ahead..." — so the <mask> is a time commitment, and the next sentence is an invitation for the other person to go ahead.
+    Therefore, the correct options are those that convey a short time frame — clearly B and C.
+    A and D are implausible — too long.
+    Thus, the correct answer is: B, C.""")

tests/test_answer_extraction.py CHANGED Viewed

@@ -1,6 +1,12 @@
 import pytest
 from timebench_eval.timebench_eval import TimebenchEval
-from conftest import PREDICTION_1, PREDICTION_2, PREDICTION_3, PREDICTION_4
 @pytest.mark.parametrize(
@@ -10,6 +16,7 @@ from conftest import PREDICTION_1, PREDICTION_2, PREDICTION_3, PREDICTION_4
         (PREDICTION_2, "August 1804"),
         (PREDICTION_3, "unanswerable"),
         (PREDICTION_4, "Cardiff City"),
     ],
 )
 def test_answer_extraction(prediction, extracted_answer):

 import pytest
 from timebench_eval.timebench_eval import TimebenchEval
+from conftest import (
+    PREDICTION_1,
+    PREDICTION_2,
+    PREDICTION_3,
+    PREDICTION_4,
+    PREDICTION_5,
+)
 @pytest.mark.parametrize(
         (PREDICTION_2, "August 1804"),
         (PREDICTION_3, "unanswerable"),
         (PREDICTION_4, "Cardiff City"),
+        (PREDICTION_5, "B, C"),
     ],
 )
 def test_answer_extraction(prediction, extracted_answer):

tests/test_metrics.py CHANGED Viewed

@@ -1,6 +1,12 @@
 from timebench_eval.timebench_eval import TimebenchEval
 import pytest
-from conftest import PREDICTION_1, PREDICTION_2, PREDICTION_3, PREDICTION_4
 @pytest.mark.parametrize(
@@ -41,6 +47,33 @@ from conftest import PREDICTION_1, PREDICTION_2, PREDICTION_3, PREDICTION_4
                 "f1": [1],
             },
         ),
     ],
 )
 def test_eval(prediction, reference, task, expected_metrics):

 from timebench_eval.timebench_eval import TimebenchEval
 import pytest
+from conftest import (
+    PREDICTION_1,
+    PREDICTION_2,
+    PREDICTION_3,
+    PREDICTION_4,
+    PREDICTION_5,
+)
 @pytest.mark.parametrize(
                 "f1": [1],
             },
         ),
+        (
+            PREDICTION_5,
+            "B. No more than ten minutes && C. No more than five minutes",
+            "TimeDial",
+            {
+                "exact_match": [1],
+                "f1": [1],
+            },
+        ),
+        (
+            PREDICTION_5,
+            "B.",
+            "TimeDial",
+            {
+                "exact_match": [0],
+                "f1": [pytest.approx(2 / 3, rel=1e-6)],
+            },
+        ),
+        (
+            PREDICTION_5,
+            "A.",
+            "TimeDial",
+            {
+                "exact_match": [0],
+                "f1": [0],
+            },
+        ),
     ],
 )
 def test_eval(prediction, reference, task, expected_metrics):

timebench_eval/.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

timebench_eval/timebench_eval.py CHANGED Viewed

@@ -13,13 +13,14 @@
 # limitations under the License.
 """TODO: Add a description here."""
 from dateutil import parser
 from dateutil.parser import ParserError
 import evaluate
 import datasets
-import numpy as np
 # TODO: Add BibTeX citation
@@ -92,6 +93,31 @@ class TimebenchEval(evaluate.Metric):
             reference_urls=["http://path.to.reference.url/new_module"],
         )
     @staticmethod
     def _extract_answer(response: str) -> str | None:
         """Extract the answer from the response"""
@@ -107,7 +133,44 @@ class TimebenchEval(evaluate.Metric):
             return "unanswerable"
         return answer or None
-    def _call_squad(self, predictions, references):
         exact_matches = []
         f1_scores = []
@@ -116,7 +179,7 @@ class TimebenchEval(evaluate.Metric):
                 {"id": "0", "prediction_text": self._extract_answer(pred)}
             ]
             formatted_ref = [
-                {"id": "0", "answers": {"text": [self._extract_answer(ref)], "answer_start": [0]}}
             ]
             results = self.squad_metric.compute(
@@ -130,14 +193,19 @@ class TimebenchEval(evaluate.Metric):
             "f1": f1_scores,
         }
-    @staticmethod
-    def _parse_historical_date(date_str):
-        try:
-            return parser.parse(date_str).replace(day=1)
-        except ParserError:
-            return None
-    def _compare_dates(self, predictions, references):
         predictions = [
             self._parse_historical_date(self._extract_answer(pred))
             for pred in predictions
@@ -149,13 +217,63 @@ class TimebenchEval(evaluate.Metric):
             ],
         }
-    def _compute(self, predictions, references, task: str):
-        """Returns the scores"""
-        if task in [
-            "TempReason",
-            "TimeQA",
-            "MenatQA",
-        ]:
-            return self._call_squad(predictions, references)
-        elif task == "Date Arithmetic":
-            return self._compare_dates(predictions, references)

 # limitations under the License.
 """TODO: Add a description here."""
+import re
+from datetime import datetime
 from dateutil import parser
 from dateutil.parser import ParserError
 import evaluate
 import datasets
 # TODO: Add BibTeX citation
             reference_urls=["http://path.to.reference.url/new_module"],
         )
+    def _compute(
+        self, predictions: list[str], references: list[str], task: str
+    ) -> dict[str, list[float]]:
+        """
+        Compute evaluation metrics for the given predictions and references.
+        Args:
+            predictions: List of prediction strings to evaluate.
+            references: List of reference strings to compare against.
+            task: Task type, one of: "TempReason", "TimeQA", "MenatQA", "Date Arithmetic", "TimeDial".
+        Returns:
+            Dictionary containing metric scores (exact_match and/or f1) as lists of floats.
+        """
+        if task in [
+            "TempReason",
+            "TimeQA",
+            "MenatQA",
+        ]:
+            return self._call_squad(predictions, references)
+        elif task == "Date Arithmetic":
+            return self._compare_dates(predictions, references)
+        elif task == "TimeDial":
+            return self._compute_timedial(predictions, references)
     @staticmethod
     def _extract_answer(response: str) -> str | None:
         """Extract the answer from the response"""
             return "unanswerable"
         return answer or None
+    def _extract_selected_options(self, text: str) -> set[str]:
+        """
+        Extract selected option letters (A, B, C, D) from various formats:
+        - "B, C"
+        - "B and C"
+        - "B & C"
+        - "B && C"
+        - "B. No more than ten minutes && C. No more than five minutes"
+        - "Options B and C"
+        - "The answer is B, C"
+        """
+        if not text:
+            return set()
+        # The pattern matches a standalone option letter (A-D):
+        # 1. It must start at a word boundary and be immediately followed by a period,
+        #    comma, whitespace, "&", or the end of the text. The delimiter is consumed
+        #    by a non-capturing group (it is not a lookahead).
+        # 2. This avoids matching letters inside words like "CAD" or "BAD".
+        # NOTE(review): a standalone capital "A" used as an article (e.g. "A few minutes")
+        # also satisfies this pattern and would be reported as option A.
+        pattern = r"\b([A-D])(?:\.|,|\s|&|$)"
+        matches = re.findall(pattern, text)
+        return set(matches)
+    def _call_squad(
+        self, predictions: list[str], references: list[str]
+    ) -> dict[str, list[float]]:
+        """
+        Compute SQuAD metrics (Exact Match and F1) for predictions and references.
+        Args:
+            predictions: List of prediction strings.
+            references: List of reference answer strings.
+        Returns:
+            Dictionary with "exact_match" and "f1" keys, each containing a list of scores.
+        """
         exact_matches = []
         f1_scores = []
                 {"id": "0", "prediction_text": self._extract_answer(pred)}
             ]
             formatted_ref = [
+                {"id": "0", "answers": {"text": [ref], "answer_start": [0]}}
             ]
             results = self.squad_metric.compute(
             "f1": f1_scores,
         }
+    def _compare_dates(
+        self, predictions: list[str], references: list[str]
+    ) -> dict[str, list[int]]:
+        """
+        Parses and compares dates in predictions and references for exact match.
+        Args:
+            predictions: List of prediction strings containing dates.
+            references: List of reference date strings.
+        Returns:
+            Dictionary with "exact_match" key containing a list of 0/1 scores.
+        """
         predictions = [
             self._parse_historical_date(self._extract_answer(pred))
             for pred in predictions
             ],
         }
+    def _compute_timedial(
+        self, predictions: list[str], references: list[str]
+    ) -> dict[str, list[float]]:
+        """
+        Compute TimeDial metrics (Exact Match and F1) using set-based comparison of selected options.
+        Args:
+            predictions: List of prediction strings.
+            references: List of reference strings containing selected options.
+        Returns:
+            Dictionary with "exact_match" and "f1" keys, each containing a list of scores.
+        """
+        exact_matches = []
+        f1_scores = []
+        for pred, ref in zip(predictions, references):
+            pred_answer = self._extract_answer(pred)  # Get text after marker
+            pred_options = (
+                self._extract_selected_options(pred_answer) if pred_answer else set()
+            )
+            ref_options = self._extract_selected_options(ref)
+            # Exact match: sets must be identical
+            em = 1 if pred_options == ref_options else 0
+            exact_matches.append(em)
+            # F1: set-based
+            if not pred_options and not ref_options:
+                f1 = 1.0  # Both empty = perfect match
+            elif not pred_options or not ref_options:
+                f1 = 0.0  # One empty, one not
+            else:
+                tp = len(pred_options & ref_options)
+                precision = tp / len(pred_options)
+                recall = tp / len(ref_options)
+                f1 = (
+                    2 * precision * recall / (precision + recall)
+                    if (precision + recall) > 0
+                    else 0.0
+                )
+            f1_scores.append(f1)
+        return {"exact_match": exact_matches, "f1": f1_scores}
+    @staticmethod
+    def _parse_historical_date(date_str: str) -> datetime | None:
+        """
+        Parse a date string and return a datetime object with day set to 1.
+        Args:
+            date_str: String representation of a date.
+        Returns:
+            datetime object with day set to 1, or None if parsing fails.
+        """
+        try:
+            return parser.parse(date_str).replace(day=1)
+        except ParserError:
+            return None

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff