Elron committed commit a62442d (verified) · 1 parent: c160aec

Upload folder using huggingface_hub

Files changed (7):
  1. inference.py +2 -0
  2. metric_utils.py +3 -1
  3. metrics.py +380 -3
  4. operators.py +103 -0
  5. splitters.py +4 -1
  6. text2sql_utils.py +3 -1
  7. version.py +1 -1
inference.py CHANGED
@@ -3268,6 +3268,7 @@ class CrossProviderInferenceEngine(
 
     provider_model_map: Dict[_supported_apis, Dict[str, str]] = {
         "watsonx-sdk": { # checked from ibm_watsonx_ai.APIClient().foundation_models.ChatModels
+            "gpt-oss-120b": "openai/gpt-oss-120b",
             "granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
             "granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
             "granite-3-8b-instruct": "ibm/granite-3-8b-instruct",
@@ -3290,6 +3291,7 @@ class CrossProviderInferenceEngine(
             "llama-3-2-3b-instruct": "meta-llama/llama-3-2-3b-instruct",
             "llama-3-2-90b-vision-instruct": "meta-llama/llama-3-2-90b-vision-instruct",
             "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
+            "llama-4-maverick": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
             "llama-guard-3-11b-vision": "meta-llama/llama-guard-3-11b-vision",
             "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7b-instruct-v01",
             "mistral-small-instruct": "mistralai/mistral-small-3-1-24b-instruct-2503",
metric_utils.py CHANGED
@@ -733,7 +733,9 @@ class InstanceScores(list):
         ).head()
         df["score_name"] = df["score"].apply(lambda x: x["instance"]["score_name"])
         df["all_scores"] = df["score"].apply(
-            lambda x: "\n".join(f"{k}: {v}" for k, v in x["instance"].items())
+            lambda x: "\n".join(
+                f"{k}: {v}" for k, v in x["instance"].items() if isoftype(v, float)
+            )
         )
         df["score"] = df["score"].apply(lambda x: x["instance"]["score"])
 
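The change restricts `all_scores` to numeric entries, so string fields such as `score_name` no longer pollute the summary column. A minimal sketch of the new formatting, with `isinstance` standing in for unitxt's `isoftype` helper:

```python
# Only float-valued entries survive the filter; "score_name" is skipped.
instance_scores = {"score": 0.5, "score_name": "accuracy", "accuracy": 0.5}
all_scores = "\n".join(
    f"{k}: {v}" for k, v in instance_scores.items() if isinstance(v, float)
)
print(all_scores)  # -> "score: 0.5\naccuracy: 0.5"
```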
metrics.py CHANGED
@@ -1,4 +1,5 @@
 import ast
+import asyncio
 import json
 import math
 import os
@@ -17,6 +18,7 @@ from typing import (
     Dict,
     Generator,
     Generic,
+    Iterable,
     List,
     Literal,
     Optional,
@@ -891,6 +893,375 @@ class MultiTurnToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]])
     }
 
 
+class ReflectionToolCallingMixin:
+    @staticmethod
+    def convert_tools_inventory(tools):
+        from llmevalkit.function_calling.pipeline.types import (
+            ToolSpec as LLMEvalKitToolSpec,
+        )
+
+        return [
+            LLMEvalKitToolSpec(
+                type="function",
+                function={**tool},
+            )
+            for tool in tools
+        ]
+
+    @staticmethod
+    def convert_tool_call(prediction: ToolCall):
+        from llmevalkit.function_calling.pipeline.types import (
+            ToolCall as LLMEvalKitToolCall,
+        )
+
+        return LLMEvalKitToolCall(
+            type="function",
+            function={
+                "name": prediction["name"],
+                "arguments": json.dumps(prediction["arguments"]),
+                "parsed_arguments": prediction["arguments"],
+            },
+        )
+
+
+class ReflectionToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]]):
+    """Measures syntactic and semantic validity of tool calls.
+
+    The final output contains two main fields: "semantic" and "static" (i.e., syntactic).
+    Under "semantic" we define two types of metrics: general and function selection.
+
+    General metrics evaluate the overall quality and correctness of the tool call.
+    These metrics include:
+    1. General hallucination check: evaluates whether each parameter value in the function call is correct, directly supported by the provided conversation history, and adheres to the tool specifications.
+    2. Value format alignment: checks whether the format of the parameter values aligns with the expected formats defined in the tool specifications.
+
+    Function selection metrics evaluate the appropriateness of the selected function for the given context.
+    These metrics include:
+    1. Function selection appropriateness: assesses whether the chosen function is suitable for the task at hand.
+    2. Agentic constraints satisfaction: assesses whether the proposed tool call satisfies all agentic constraints required for execution.
+
+    Static metrics evaluate the syntactic validity of the tool call and include the following scores:
+    - non_existent_function: tool name not found.
+    - non_existent_parameter: argument name not in tool spec.
+    - incorrect_parameter_type: argument type mismatch.
+    - missing_required_parameter: required argument missing.
+    - allowed_values_violation: argument value outside allowed set.
+    - json_schema_violation: call violates JSON schema.
+    - empty_api_spec: no tool spec provided.
+    - invalid_api_spec: tool spec is invalid.
+    - invalid_tool_call: call is not a valid tool invocation.
+    - overall_valid: validity of the call (main score).
+    - score: alias of overall_valid.
+
+    Here is an example of an aggregated reflection output after calling reduce.
+    The range of each score is [0, 1] (higher indicates fewer errors).
+    {
+        "static_non_existent_function": 1.0,
+        "static_non_existent_parameter": 1.0,
+        "static_incorrect_parameter_type": 1.0,
+        "static_missing_required_parameter": 1.0,
+        "static_allowed_values_violation": 1.0,
+        "static_json_schema_violation": 1.0,
+        "static_empty_api_spec": 1.0,
+        "static_invalid_api_spec": 1.0,
+        "static_invalid_tool_call": 1.0,
+        "semantic_general_hallucination_check": 0.0,
+        "semantic_general_value_format_alignment": 0.0,
+        "semantic_avg_score_general": 1.0,
+        "semantic_function_selection_appropriateness": 0.0,
+        "semantic_agentic_constraints_satisfaction": 0.0,
+        "semantic_avg_score_function_selection": 1.0,
+        "overall_valid": 1.0
+    }
+
+    overall_valid is the final decision made by the reflection pipeline, indicating whether the tool call is valid.
+
+    Before aggregation, each metric also carries evidence, an explanation, a more fine-grained score, etc.
+
+    Reference: https://github.ibm.com/MLT/LLMEvalKit
+    """
+
+    main_score = "overall_valid"
+    reduction = MeanReduction()
+    prediction_type = ToolCall
+    _requirements_list = {
+        "llmevalkit": "Install with \"pip install 'git+ssh://git@github.ibm.com/MLT/LLMEvalKit.git'\".\nTo gain access, please reach out to the team."
+    }
+    runtime_pipeline: bool = True  # Whether to use the runtime pipeline or the longer evaluation pipeline with actionable recommendations
+
+    def prepare(self):
+        provider_to_default_reflector_model = {
+            "watsonx": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8",
+            "open-ai": "gpt-4o",
+            "rits": "openai/gpt-oss-120b",
+            "azure": "gpt-4o",
+            "mock": "mock",
+        }
+        provider = (
+            settings.default_provider if not settings.mock_inference_mode else "mock"
+        )
+        if provider not in provider_to_default_reflector_model:
+            raise ValueError(
+                f"Unsupported provider for ReflectionToolCallingMetric: {provider}. Supported providers are: {list(provider_to_default_reflector_model.keys())}"
+            )
+        self.requirements = self._get_missing_requirements_by_provider(provider)
+        super().prepare()
+        self.setup_pipeline(
+            reflector_model_name=provider_to_default_reflector_model.get(provider),
+            provider_name=provider,
+        )
+
+    def setup_pipeline(
+        self, reflector_model_name: str, provider_name: Optional[str] = None
+    ):
+        if provider_name:
+            llmeval_provider_name = self._get_llmeval_provider_name(provider_name)
+            requirements = self._get_missing_requirements_by_provider(provider_name)
+            self.check_missing_requirements(requirements)
+        else:
+            # Without a provider name the llmevalkit client cannot be resolved.
+            raise ValueError(
+                "provider_name is required to resolve the llmevalkit metrics client."
+            )
+
+        metrics_client = self._get_metrics_client(
+            llmeval_provider_name, reflector_model_name
+        )
+        self.reflection_pipeline = self._build_reflection_pipeline(metrics_client)
+        return self.reflection_pipeline
+
+    @staticmethod
+    def _get_llmeval_provider_name(provider_name: str) -> str:
+        mapping = {
+            "watsonx": "watsonx.output_val",
+            "open-ai": "openai.async.output_val",
+            "rits": "litellm.rits.output_val",
+            "azure": "azure_openai.async.output_val",
+            "mock": "mock.output_val",
+        }
+        llmeval_provider_name = mapping.get(provider_name)
+        if llmeval_provider_name is None:
+            raise ValueError(f"Unsupported provider by llmevalkit: {provider_name}")
+        return llmeval_provider_name
+
+    @staticmethod
+    def _get_missing_requirements_by_provider(provider_name: str):
+        provider_libs = {
+            "watsonx": "ibm_watsonx_ai",
+            "open-ai": "openai",
+            "rits": "litellm",
+            "azure": "openai",
+        }
+        required_lib = provider_libs.get(provider_name)
+        return [required_lib] if required_lib else []
+
+    @staticmethod
+    def _get_metrics_client(llmeval_provider_name: str, reflector_model_name: str):
+        from llmevalkit.llm import get_llm
+
+        metrics_client_cls = get_llm(llmeval_provider_name)
+        return metrics_client_cls(model_name=reflector_model_name)
+
+    def _build_reflection_pipeline(self, metrics_client):
+        from llmevalkit.function_calling.consts import (
+            METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
+            METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
+            METRIC_GENERAL_HALLUCINATION_CHECK,
+            METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
+        )
+        from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
+
+        return ReflectionPipeline(
+            metrics_client=metrics_client,
+            general_metrics=[
+                METRIC_GENERAL_HALLUCINATION_CHECK,
+                METRIC_GENERAL_VALUE_FORMAT_ALIGNMENT,
+            ],
+            function_metrics=[
+                METRIC_FUNCTION_SELECTION_APPROPRIATENESS,
+                METRIC_AGENTIC_CONSTRAINTS_SATISFACTION,
+            ],
+            parameter_metrics=[],
+            runtime_pipeline=self.runtime_pipeline,
+        )
+
+    async def map(
+        self,
+        prediction: ToolCall,
+        references: None,
+        task_data: Dict[str, Any],
+    ):
+        from llmevalkit.function_calling.pipeline.types import PipelineResult
+
+        # Convert unitxt dialog to LLMEvalKit format
+        if "dialog" in task_data:
+            conversation_history = [dict(turn) for turn in task_data["dialog"]]
+        elif "query" in task_data:
+            conversation_history = [{"role": "user", "content": task_data["query"]}]
+        else:
+            raise ValueError("task_data must contain either 'dialog' or 'query' field.")
+
+        # Convert unitxt tool inventory to LLMEvalKit format
+        tools_inventory = ReflectionToolCallingMixin.convert_tools_inventory(
+            task_data.get("tools", [])
+        )
+
+        # Convert unitxt tool call to LLMEvalKit format
+        tool_call_converted = ReflectionToolCallingMixin.convert_tool_call(prediction)
+
+        # Run reflection (syntactic + semantic)
+        result: PipelineResult = await self.reflection_pipeline.run_async(
+            conversation=conversation_history,
+            inventory=tools_inventory,
+            call=tool_call_converted,
+            retries=3,
+            continue_on_static=True,
+        )
+        result_dict = result.model_dump()
+        result_dict["overall_valid"] = float(result_dict["overall_valid"])
+        return result_dict
+
+    def map_stream(
+        self,
+        items: Iterable[Tuple[ToolCall, None, Dict[str, Any]]],
+        *,
+        max_concurrency: int = 8,
+    ) -> List[Dict[str, Any]]:
+        """Run self.map in parallel over an iterable and return results in order."""
+
+        async def process_all():
+            items_iter = iter(enumerate(items))
+            results = []
+            pending = set()
+            while True:
+                # Keep up to max_concurrency map() tasks in flight.
+                while len(pending) < max_concurrency:
+                    try:
+                        idx, (pred, refs, data) = next(items_iter)
+                        if isinstance(pred, list):
+                            for p in pred:
+                                task = asyncio.create_task(self.map(p, refs, data))
+                                task.idx = idx
+                                pending.add(task)
+                        else:
+                            task = asyncio.create_task(self.map(pred, refs, data))
+                            task.idx = idx
+                            pending.add(task)
+                    except StopIteration:
+                        break
+                if not pending:
+                    break
+                done, pending = await asyncio.wait(
+                    pending, return_when=asyncio.FIRST_COMPLETED
+                )
+                for task in done:
+                    results.append((task.idx, await task))
+            # Sort on the input index only; the result dicts are not orderable.
+            results.sort(key=lambda pair: pair[0])
+            return [r for _, r in results]
+
+        return asyncio.run(process_all())
+
+    def reduce_one(self, intermediate: Dict[str, Any]) -> Dict[str, float]:
+        return intermediate
+
+    def reduce(self, intermediates: List[Dict[str, Any]]) -> Dict[str, float]:
+        flat_instances = []
+        for instance in intermediates:
+            flat_instance_dict = {}
+            for metric, metric_type_dict in (
+                instance.get("static", {}).get("metrics", {}).items()
+            ):
+                flat_instance_dict[f"static_{metric}"] = float(
+                    metric_type_dict["valid"]
+                )
+
+            for metric_type, metric_type_dict in instance.get("semantic", {}).items():
+                if metric_type_dict is not None:
+                    for metric, metric_dict in metric_type_dict.get(
+                        "metrics", {}
+                    ).items():
+                        flat_instance_dict[f"semantic_{metric}"] = 1 - float(
+                            metric_dict["is_issue"]
+                        )
+                    flat_instance_dict[f"semantic_avg_score_{metric_type}"] = float(
+                        metric_type_dict.get("avg_score")
+                    )
+
+            flat_instance_dict["overall_valid"] = float(instance["overall_valid"])
+            flat_instances.append(flat_instance_dict)
+
+        return self.reduction.reduce(flat_instances)
+
+
+class ReflectionToolCallingMetricSyntactic(
+    ReductionInstanceMetric[str, Dict[str, float]]
+):
+    """Measures syntactic and schema validity of tool calls.
+
+    Range: [0, 1] (higher indicates fewer errors).
+    Returns 1.0 for each metric if the tool call is valid, 0.0 otherwise.
+    overall_valid equals 1.0 if all metrics are valid, 0.0 otherwise.
+    The global score is the percentage of valid instances across the dataset.
+
+    Scores:
+    - non_existent_function: tool name not found.
+    - non_existent_parameter: argument name not in tool spec.
+    - incorrect_parameter_type: argument type mismatch.
+    - missing_required_parameter: required argument missing.
+    - allowed_values_violation: argument value outside allowed set.
+    - json_schema_violation: call violates JSON schema.
+    - empty_api_spec: no tool spec provided.
+    - invalid_api_spec: tool spec is invalid.
+    - invalid_tool_call: call is not a valid tool invocation.
+    - overall_valid: validity of the call (main score).
+    - score: alias of overall_valid.
+
+    Reference: https://github.ibm.com/MLT/LLMEvalKit
+    """
+
+    main_score = "overall_valid"
+    reduction = MeanReduction()
+    prediction_type = ToolCall
+    _requirements_list = {
+        "llmevalkit": "Install with \"pip install 'git+ssh://git@github.ibm.com/MLT/LLMEvalKit.git'\".\nTo gain access, please reach out to the team."
+    }
+
+    def map(
+        self,
+        prediction: ToolCall,
+        references: None,
+        task_data: Dict[str, Any],
+    ) -> Dict[str, float]:
+        from llmevalkit.function_calling.pipeline.pipeline import ReflectionPipeline
+
+        # Convert unitxt tool inventory to LLMEvalKit format
+        tools_inventory = ReflectionToolCallingMixin.convert_tools_inventory(
+            task_data.get("tools", [])
+        )
+
+        # Convert unitxt tool call to LLMEvalKit format
+        tool_call = ReflectionToolCallingMixin.convert_tool_call(prediction)
+
+        # Run static validation
+        static_result = ReflectionPipeline.static_only(tools_inventory, tool_call)
+
+        result_dict = static_result.model_dump()
+        result_dict["overall_valid"] = float(result_dict.pop("final_decision"))
+        result_dict["metrics"]["json_schema_violation"] = result_dict["metrics"].pop(
+            "json_schema_validation"
+        )
+        return result_dict
+
+    def reduce_one(self, intermediate: Dict[str, float]) -> Dict[str, float]:
+        return intermediate
+
+    def reduce(self, intermediates: List[Dict[str, float]]) -> Dict[str, float]:
+        flat_instances = []
+        for instance in intermediates:
+            flat_instance_dict = {}
+            for metric, metric_dict in instance.get("metrics", {}).items():
+                flat_instance_dict[metric] = float(metric_dict["valid"])
+            flat_instance_dict["overall_valid"] = float(instance["overall_valid"])
+            flat_instances.append(flat_instance_dict)
+
+        return self.reduction.reduce(flat_instances)
+
+
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
     # Use None to disable confidence interval computation.
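The `map_stream` method added above is a bounded-concurrency fan-out: it keeps at most `max_concurrency` `map` coroutines in flight and restores input order before returning. A self-contained sketch of the same pattern, with the index tracked in a side dict rather than as a task attribute (all names are illustrative):

```python
import asyncio
from typing import Any, Awaitable, Callable, Iterable, List

async def bounded_gather(
    items: Iterable[Any],
    fn: Callable[[Any], Awaitable[Any]],
    max_concurrency: int = 8,
) -> List[Any]:
    items_iter = iter(enumerate(items))
    pending, results, task_index = set(), [], {}
    while True:
        # Top up the in-flight set until the concurrency cap is reached.
        while len(pending) < max_concurrency:
            try:
                idx, item = next(items_iter)
            except StopIteration:
                break
            task = asyncio.create_task(fn(item))
            task_index[task] = idx
            pending.add(task)
        if not pending:
            break
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            results.append((task_index.pop(task), await task))
    # Restore input order by sorting on the index alone.
    results.sort(key=lambda pair: pair[0])
    return [r for _, r in results]

async def _square(x: int) -> int:
    await asyncio.sleep(0.01)
    return x * x

print(asyncio.run(bounded_gather(range(5), _square, max_concurrency=2)))
# -> [0, 1, 4, 9, 16]
```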
 
@@ -4889,12 +5260,11 @@ class FaithfulnessHHEM(BulkInstanceMetric):
     # single_reference_per_prediction = True
     max_context_words = 4096
     reduction_map = {"mean": [main_score]}
+    model = None
 
     _requirements_list: List[str] = ["transformers", "torch"]
 
-    @retry_connection_with_exponential_backoff(backoff_factor=2)
-    def prepare(self):
-        super().prepare()
+    def load_model(self):
         import torch
 
         if torch.cuda.is_available():
@@ -4912,6 +5282,11 @@ class FaithfulnessHHEM(BulkInstanceMetric):
             model_path, trust_remote_code=True
         ).to(device)
 
+    @retry_connection_with_exponential_backoff(backoff_factor=2)
+    def prepare(self):
+        super().prepare()
+        # load_model() was moved from prepare() to compute() because the model is gated on HF
+
     def compute(
         self,
         references: List[List[Any]],
@@ -4920,6 +5295,8 @@ class FaithfulnessHHEM(BulkInstanceMetric):
     ) -> List[Dict[str, Any]]:
         from tqdm import tqdm
 
+        if self.model is None:
+            self.load_model()
         # treat the references as the contexts and the predictions as answers
         # concat references
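This `FaithfulnessHHEM` change defers model loading from `prepare()` to the first `compute()` call, since the HHEM checkpoint is gated on the Hugging Face Hub and should only be downloaded when the metric is actually used. A distilled sketch of the lazy-initialization pattern (the class and stand-in values here are illustrative, not the metric's real API):

```python
class LazyModelMetric:
    model = None  # class-level sentinel: nothing is loaded at setup time

    def load_model(self):
        # The expensive (possibly gated) download happens here, on demand.
        self.model = "loaded"  # stand-in for AutoModelForSequenceClassification(...)

    def compute(self, predictions):
        if self.model is None:  # only the first call pays the loading cost
            self.load_model()
        return [0.0 for _ in predictions]  # placeholder scores

metric = LazyModelMetric()
metric.compute(["p1", "p2"])  # triggers load_model() exactly once
metric.compute(["p3"])        # reuses the already-loaded model
```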
operators.py CHANGED
@@ -40,6 +40,7 @@ General Operators List:
 """
 
 import operator
+import re
 import uuid
 import warnings
 import zipfile
@@ -2611,3 +2612,105 @@ class ReadFile(FieldOperator):
         # Read from local file
         with open(value, encoding=self.encoding) as f:
             return f.read()
+
+
+class FixJsonSchemaOfParameterTypes(InstanceOperator):
+    main_field: str
+
+    def prepare(self):
+        self.simple_mapping = {
+            "": "object",
+            "any": "object",
+            "Any": "object",
+            "Array": "array",
+            "ArrayList": "array",
+            "Bigint": "integer",
+            "bool": "boolean",
+            "Boolean": "boolean",
+            "byte": "integer",
+            "char": "string",
+            "dict": "object",
+            "Dict": "object",
+            "double": "number",
+            "float": "number",
+            "HashMap": "object",
+            "Hashtable": "object",
+            "int": "integer",
+            "list": "array",
+            "List": "array",
+            "long": "integer",
+            "Queue": "array",
+            "short": "integer",
+            "Stack": "array",
+            "tuple": "array",
+            "Set": "array",
+            "set": "array",
+            "str": "string",
+            "String": "string",
+        }
+
+    def dict_type_of(self, type_str: str) -> dict:
+        return {"type": type_str}
+
+    def recursive_trace_for_type_fields(self, containing_element):
+        if isinstance(containing_element, dict):
+            keys = list(containing_element.keys())
+            for key in keys:
+                if key == "type" and isinstance(containing_element["type"], str):
+                    jsonschema_dict = self.type_str_to_jsonschema_dict(
+                        containing_element["type"]
+                    )
+                    containing_element.pop("type")
+                    containing_element.update(jsonschema_dict)
+                else:
+                    self.recursive_trace_for_type_fields(containing_element[key])
+        elif isinstance(containing_element, list):
+            for list_element in containing_element:
+                self.recursive_trace_for_type_fields(list_element)
+
+    def type_str_to_jsonschema_dict(self, type_str: str) -> dict:
+        if type_str in self.simple_mapping:
+            return self.dict_type_of(self.simple_mapping[type_str])
+        m = re.match(r"^(List|Tuple)\[(.*?)\]$", type_str)
+        if m:
+            basic_type = self.dict_type_of("array")
+            basic_type["items"] = self.type_str_to_jsonschema_dict(
+                m.group(2) if m.group(1) == "List" else m.group(2).split(",")[0].strip()
+            )
+            return basic_type
+
+        m = re.match(r"^(Union)\[(.*?)\]$", type_str)
+        if m:
+            args = m.group(2).split(",")
+            for i in range(len(args)):
+                args[i] = args[i].strip()
+            return {"anyOf": [self.type_str_to_jsonschema_dict(arg) for arg in args]}
+        if re.match(r"^(Callable)\[(.*?)\]$", type_str):
+            return self.dict_type_of("object")
+        if "," in type_str:
+            sub_types = type_str.split(",")
+            for i in range(len(sub_types)):
+                sub_types[i] = sub_types[i].strip()
+            assert len(sub_types) in [
+                2,
+                3,
+            ], f"num of subtypes should be 2 or 3, got {type_str}"
+            basic_type = self.type_str_to_jsonschema_dict(sub_types[0])
+            for sub_type in sub_types[1:]:
+                if sub_type.lower().startswith("default"):
+                    basic_type["default"] = re.split(r"[= ]", sub_type, maxsplit=1)[1]
+            for sub_type in sub_types[1:]:
+                if sub_type.lower().startswith("optional"):
+                    return {"anyOf": [basic_type, self.dict_type_of("null")]}
+            return basic_type
+
+        return self.dict_type_of(type_str)  # otherwise - return what arrived
+
+    def process(
+        self, instance: Dict[str, Any], stream_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        assert (
+            self.main_field in instance
+        ), f"field '{self.main_field}' must reside in instance in order to verify its jsonschema correctness. got {instance}"
+        self.recursive_trace_for_type_fields(instance[self.main_field])
+        return instance
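`FixJsonSchemaOfParameterTypes` rewrites free-form `type` strings found in tool specifications into proper JSON-schema fragments. A distilled, runnable re-implementation of three of its branches (simple mapping, `List[...]`, `Union[...]`); the full operator above also handles `Tuple`, `Callable`, defaults, and `optional` markers:

```python
import re

SIMPLE = {"int": "integer", "str": "string", "list": "array", "dict": "object"}

def to_jsonschema(type_str: str) -> dict:
    if type_str in SIMPLE:
        return {"type": SIMPLE[type_str]}
    m = re.match(r"^List\[(.*?)\]$", type_str)
    if m:  # e.g. "List[int]" -> array of integers
        return {"type": "array", "items": to_jsonschema(m.group(1))}
    m = re.match(r"^Union\[(.*?)\]$", type_str)
    if m:  # e.g. "Union[int, str]" -> anyOf
        return {"anyOf": [to_jsonschema(a.strip()) for a in m.group(1).split(",")]}
    return {"type": type_str}  # pass through anything unrecognized

assert to_jsonschema("List[int]") == {"type": "array", "items": {"type": "integer"}}
assert to_jsonschema("Union[int, str]") == {
    "anyOf": [{"type": "integer"}, {"type": "string"}]
}
```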
splitters.py CHANGED
@@ -312,11 +312,14 @@ class DiverseLabelsSampler(Sampler):
         sample_size: int,
         instances_pool: List[Dict[str, object]],
         instance: Optional[Dict[str, object]],
+        sampling_seed: Optional[int] = None,
     ) -> List[Dict[str, object]]:
         if self.labels_cache is None:
             self.labels_cache = self.divide_by_repr(instances_pool)
         all_labels = list(self.labels_cache.keys())
-        random_generator = get_random_generator_based_on_instance(instance)
+        random_generator = get_random_generator_based_on_instance(
+            instance, local_seed=sampling_seed
+        )
         random_generator.shuffle(all_labels)
         from collections import Counter
 
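The new `sampling_seed` parameter lets callers pin the demo-selection shuffle to a fixed seed instead of deriving it from the instance. A hedged sketch of the intended behavior (`make_generator` stands in for `get_random_generator_based_on_instance`, whose exact seed derivation is not shown in this diff):

```python
import random
from typing import Any, Optional

def make_generator(instance: Any, local_seed: Optional[int] = None) -> random.Random:
    # Stand-in: seed from the instance unless an explicit local seed is given.
    seed = local_seed if local_seed is not None else str(instance)
    return random.Random(seed)

labels = ["pos", "neg", "neutral"]
a, b = labels.copy(), labels.copy()
make_generator({"id": 1}, local_seed=42).shuffle(a)
make_generator({"id": 2}, local_seed=42).shuffle(b)
assert a == b  # same sampling_seed -> same shuffle, regardless of instance
```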
text2sql_utils.py CHANGED
@@ -728,7 +728,7 @@ def sqlparse_queries_equivalent(sql1: str, sql2: str) -> bool:
                 return False
         return True
     except Exception as e:
-        logger.debug(f"Errpr parsing SQL query for comparison: {e}")
+        logger.debug(f"Error parsing SQL query for comparison: {e}")
         return False
 
 
@@ -863,6 +863,8 @@ def compare_dfs_ignore_colnames_subset(
     if df1.empty or df2.empty or len(df1) != len(df2):
         return False
 
+    df1.columns = range(df1.shape[1])
+    df2.columns = range(df2.shape[1])
     subset_df, superset_df = (df1, df2) if df1.shape[1] <= df2.shape[1] else (df2, df1)
 
     if ignore_row_order:
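The two added lines normalize both frames to positional column labels before the subset comparison, so frames with identical data but different column names compare equal. A minimal illustration of why the rename matters:

```python
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df2 = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
assert not df1.equals(df2)  # column names differ

# Same normalization as in compare_dfs_ignore_colnames_subset:
df1.columns = range(df1.shape[1])
df2.columns = range(df2.shape[1])
assert df1.equals(df2)  # now only the values matter
```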
version.py CHANGED
@@ -1 +1 @@
-version = "1.26.6"
+version = "1.26.7"