"""
DABstep Benchmark Utilities

Adapted from:
https://huggingface.co/spaces/adyen/DABstep/blob/main/dabstep_benchmark/utils.py
"""

from __future__ import annotations

import re

import pandas as pd

from dabstep_benchmark.evaluation.scorer import question_scorer

# Compiled once at import time so that validating many URLs does not rebuild
# the pattern on every call.
_HTTPS_URL_PATTERN = re.compile(
    r'^https://'  # URL must start with 'https://'
    r'(?!10(?:\.\d{1,3}){3})'  # Exclude private IP 10.x.x.x
    r'(?!127(?:\.\d{1,3}){3})'  # Exclude loopback IP 127.x.x.x
    r'(?!169\.254(?:\.\d{1,3}){2})'  # Exclude link-local IP 169.254.x.x
    r'(?!192\.168(?:\.\d{1,3}){2})'  # Exclude private IP 192.168.x.x
    r'(?!172\.(?:1[6-9]|2[0-9]|3[0-1])(?:\.\d{1,3}){2})'  # Exclude private IP 172.16.x.x - 172.31.x.x
    r'(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,})'  # Match domain name
    r'(?::\d{2,5})?'  # Optional port
    # Optional path. `\Z` (not `$`) so a trailing newline is rejected:
    # `$` also matches just before a final "\n".
    r'(?:/[^\s]*)?\Z',
    re.IGNORECASE,
)


def _styled_paragraph(msg: str, color: str) -> str:
    """Wrap *msg* in a centered, 20px HTML paragraph rendered in *color*.

    NOTE: *msg* is interpolated verbatim (no HTML escaping); callers that
    pass untrusted text must escape it first.
    """
    return f"<p style='color: {color}; font-size: 20px; text-align: center;'>{msg}</p>"


def format_error(msg: str) -> str:
    """Format an error message in red."""
    return _styled_paragraph(msg, "red")


def format_warning(msg: str) -> str:
    """Format a warning message in orange."""
    return _styled_paragraph(msg, "orange")


def format_log(msg: str) -> str:
    """Format a log message in green."""
    return _styled_paragraph(msg, "green")


def model_hyperlink(link: str, model_name: str) -> str:
    """Create an HTML hyperlink that shows *model_name* and points to *link*.

    The anchor opens in a new tab. Both values are interpolated verbatim
    (no HTML escaping).
    """
    return (
        f'<a target="_blank" href="{link}" '
        f'style="color: var(--link-text-color); text-decoration: underline;'
        f'text-decoration-style: dotted;">{model_name}</a>'
    )


def is_valid_https_url(url: str) -> bool:
    """Validate that a URL is a valid HTTPS URL.

    A URL is accepted when it starts with ``https://``, has a dotted host
    name ending in an alphabetic label of at least two letters, optionally
    followed by a port and a whitespace-free path. Hosts that begin with a
    private, loopback or link-local IPv4 prefix are rejected.

    Args:
        url: Candidate URL.

    Returns:
        True if *url* matches the pattern, False otherwise (including when
        *url* is not a string).
    """
    if not isinstance(url, str):
        return False
    return _HTTPS_URL_PATTERN.match(url) is not None


def evaluate(
    agent_answers: pd.DataFrame,
    tasks_with_gt: pd.DataFrame,
    submission_id: str = "",
) -> list[dict]:
    """
    Evaluate agent answers against ground truth.

    Args:
        agent_answers: DataFrame with columns 'task_id' and 'agent_answer'
        tasks_with_gt: DataFrame with columns 'task_id', 'answer', and 'level'
        submission_id: Identifier for the submission

    Returns:
        List of score dictionaries for each task, in the row order of
        ``tasks_with_gt``. Each dictionary has the keys 'submission_id',
        'task_id', 'score', 'level' and 'agent_answer'.

    Raises:
        KeyError: If a task in ``tasks_with_gt`` has no row in
            ``agent_answers`` (or a required column is missing).
    """
    # Index the submission once instead of scanning the whole frame for every
    # ground-truth row. Keys are normalised to str so they compare equal to
    # the str-normalised ground-truth ids regardless of the column dtype.
    # setdefault keeps the first answer when a task id is duplicated.
    submitted: dict[str, object] = {}
    for raw_id, answer in zip(agent_answers["task_id"], agent_answers["agent_answer"]):
        submitted.setdefault(str(raw_id), answer)

    task_scores = []
    for raw_id, correct_answer, raw_level in zip(
        tasks_with_gt["task_id"], tasks_with_gt["answer"], tasks_with_gt["level"]
    ):
        task_id = str(raw_id)
        if task_id not in submitted:
            raise KeyError(
                f"Task ID: {task_id} not found. Are you sure you submitted the correct file?"
            )

        agent_answer = submitted[task_id]
        task_scores.append(
            {
                "submission_id": submission_id,
                "task_id": task_id,
                "score": question_scorer(agent_answer, correct_answer),
                "level": str(raw_level),
                "agent_answer": agent_answer,
            }
        )

    return task_scores