Spaces:

sorgfresser
/

valid_efficiency_score

Sleeping

App Files Files Community

Simon Sorg committed on Jan 10, 2024

Commit

b954f30

1 Parent(s): c897860

feat: add compute

Browse files

Files changed (1) hide show

valid_efficiency_score.py +107 -29

valid_efficiency_score.py CHANGED Viewed

@@ -11,46 +11,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
 import evaluate
 import datasets
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
 }
 """
-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
 """
-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
 Args:
     predictions: list of predictions to score. Each prediction
         should be a string with tokens separated by spaces.
     references: list of references, one for each prediction. Each
         reference should be a string with tokens separated by spaces.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-    >>> my_new_module = evaluate.load("my_new_module")
     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
-    {'accuracy': 1.0}
 """
 # TODO: Define external resources urls if needed
@@ -59,10 +62,9 @@ BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ValidEfficiencyScore(evaluate.Metric):
-    """TODO: Short description of my evaluation module."""
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
             module_type="metric",
@@ -71,14 +73,14 @@ class ValidEfficiencyScore(evaluate.Metric):
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
             }),
             # Homepage of the module for documentation
             homepage="http://module.homepage",
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
     def _download_and_prepare(self, dl_manager):
@@ -86,10 +88,86 @@ class ValidEfficiencyScore(evaluate.Metric):
         # TODO: Download external resources if needed
         pass
-    def _compute(self, predictions, references):
-        """Returns the scores"""
         # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
         return {
-            "accuracy": accuracy,
-        }

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# This is a module to compute the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks as
+# proposed in "Can LLM Already Serve as a Database Interface?
+# A Big Bench for Large-Scale Database Grounded Text-to-SQLs" (Li et al., 2023)
 import evaluate
 import datasets
+from time import time
+import numpy as np
+from math import sqrt
 _CITATION = """\
+@article{li2023can,
+  title={Can llm already serve as a database interface? a big bench for large-scale database grounded text-to-sqls},
+  author={Li, Jinyang and Hui, Binyuan and Qu, Ge and Li, Binhua and Yang, Jiaxi and Li, Bowen and Wang, Bailin and Qin, Bowen and Cao, Rongyu and Geng, Ruiying and others},
+  journal={arXiv preprint arXiv:2305.03111},
+  year={2023}
 }
 """
 _DESCRIPTION = """\
+This module computes the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks.
 """
 _KWARGS_DESCRIPTION = """
+Calculates how good the predictions are given some ground truth sql queries, using the Valid Efficiency Score (VES).
 Args:
     predictions: list of predictions to score. Each prediction
         should be a string with tokens separated by spaces.
     references: list of references, one for each prediction. Each
         reference should be a string with tokens separated by spaces.
+    execute: function that takes a list of sql queries and returns a list of results, one for each query.
+        Results should be a list of tuples, each tuple containing the values of a row.
+    filter_func: function that takes a string and returns a boolean.
+        If True, the string is kept, otherwise it is dropped.
+    num_executions: number of times to execute each sql query to get the execution time.
 Returns:
+    ves: Valid Efficiency Score of the predictions compared to the references.
 Examples:
+    >>> my_new_module = evaluate.load("valid_efficiency_score")
     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
+    {'ves': 1.0}
 """
 # TODO: Define external resources urls if needed
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ValidEfficiencyScore(evaluate.Metric):
+    """Valid Efficiency Score (VES) metric for text-to-SQL tasks."""
     def _info(self):
         return evaluate.MetricInfo(
             # This is the description that will appear on the modules page.
             module_type="metric",
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
+                'predictions': datasets.Value('string'),
+                'references': datasets.Value('string'),
             }),
             # Homepage of the module for documentation
             homepage="http://module.homepage",
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
+            reference_urls=[]
         )
     def _download_and_prepare(self, dl_manager):
         # TODO: Download external resources if needed
         pass
+    def _compute(self, predictions, references, execute, filter_func=None, num_executions=100):
+        """Returns the valid efficiency score of the predictions compared to the references."""
         # TODO: Compute the different scores of the module
+        if len(predictions) != len(references):
+            raise ValueError("Predictions and references must have the same number of elements.")
+        # Run filter_func on predictions and references if needed
+        filtered_predictions = []
+        filtered_references = []
+        passing_reference_only = 0
+        if filter_func is not None:
+            for prediction, reference in zip(predictions, references):
+                # Only keep if both prediction and reference pass the filter
+                if filter_func(prediction) and filter_func(reference):
+                    filtered_predictions.append(prediction)
+                    filtered_references.append(reference)
+                # If only the reference passes the filter, count it
+                elif filter_func(reference):
+                    passing_reference_only += 1
+        # Execute ground truth sql queries to get the ground truth results and the time it takes to execute them
+        ground_results = execute(filtered_references)
+        reference_times = np.zeros(num_executions)
+        for i in range(num_executions):
+            start_time = time()
+            execute(filtered_references)
+            end_time = time()
+            reference_times[i] = end_time - start_time
+        # Execute predicted sql queries to get the predicted results and the time it takes to execute them
+        predicted_results = execute(filtered_predictions)
+        prediction_times = np.zeros(num_executions)
+        for i in range(num_executions):
+            start_time = time()
+            execute(filtered_predictions)
+            end_time = time()
+            prediction_times[i] = end_time - start_time
+        # Get mean, std and 3 sigma interval
+        reference_mean = np.mean(reference_times)
+        reference_std = np.std(reference_times)
+        lower_bound = reference_mean - 3 * reference_std
+        upper_bound = reference_mean + 3 * reference_std
+        # Drop outliers
+        filtered_reference_times = reference_times[(reference_times >= lower_bound) & (reference_times <= upper_bound)]
+        # Get mean, std and 3 sigma interval
+        prediction_mean = np.mean(prediction_times)
+        prediction_std = np.std(prediction_times)
+        lower_bound = prediction_mean - 3 * prediction_std
+        upper_bound = prediction_mean + 3 * prediction_std
+        # Drop outliers
+        filtered_prediction_times = prediction_times[
+            (prediction_times >= lower_bound) & (prediction_times <= upper_bound)]
+        # Get new means as e_scores
+        reference_mean = np.mean(filtered_reference_times)
+        prediction_mean = np.mean(filtered_prediction_times)
+        r_value = sqrt(reference_mean / prediction_mean)
+        # Run filter_func on predictions and references if needed
+        filtered_predictions = []
+        filtered_references = []
+        divider = 0
+        if filter_func is not None:
+            for prediction, reference in zip(predictions, references):
+                # Only keep if both prediction and reference pass the filter
+                pred_bool = filter_func(prediction)
+                ref_bool = filter_func(reference)
+                if pred_bool and ref_bool:
+                    filtered_predictions.append(prediction)
+                    filtered_references.append(reference)
+                    divider += 1
+                # If only the reference passes the filter, count it
+                elif pred_bool != ref_bool:
+                    divider += 1
+        accuracy = sum(
+            execute(i) == execute(j) for i, j in zip(filtered_predictions, filtered_references)) / divider
         return {
+            "ves": accuracy * r_value,
+        }