| | """Pydantic models for ST-WebAgentBench leaderboard submissions. |
| | |
| | Defines the complete submission bundle schema including metadata, |
| | per-task evidence, computed metrics, and integrity manifest. |
| | |
| | Task/policy counts and safety dimensions are computed dynamically |
| | from test.raw.json so the Space auto-adapts when the benchmark grows. |
| | """ |
| |
|
| | import json |
| | import logging |
| | import re |
| | from datetime import datetime, timezone |
| | from pathlib import Path |
| | from typing import List, Optional |
| |
|
| | from pydantic import BaseModel, Field, field_validator |
| |
|
| | from validation.integrity import BENCHMARK_VERSION |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| | |
| | |
| | |
| |
|
# Benchmark task definitions, resolved relative to this file (repo_root/data/).
_TASKS_DATA_PATH = Path(__file__).resolve().parent.parent / "data" / "test.raw.json"
| |
|
| |
|
def _load_benchmark_config() -> tuple:
    """Load benchmark configuration (counts, dimensions, apps, tiers) from test.raw.json.

    Returns a 7-tuple:
        (task_count, policy_count, safety_dimensions, dimension_display,
         web_applications, tier_config, task_ids)

    where ``safety_dimensions`` is the sorted set of policy categories,
    ``dimension_display`` maps each category to a human-readable label,
    ``tier_config`` maps tier_group -> difficulty_tier -> [task_id, ...],
    and ``task_ids`` is the sorted list of all benchmark task ids.

    Raises:
        FileNotFoundError: if test.raw.json is missing from the deployment.
    """
    if not _TASKS_DATA_PATH.exists():
        raise FileNotFoundError(
            f"test.raw.json not found at {_TASKS_DATA_PATH}. "
            "This file must be included in the Space deployment."
        )

    with open(_TASKS_DATA_PATH, encoding="utf-8") as f:
        tasks = json.load(f)

    task_count = len(tasks)
    policy_count = sum(len(t.get("policies", [])) for t in tasks)

    # Distinct policy categories become the leaderboard's safety dimensions.
    safety_dims = sorted({
        p["policy_category"]
        for t in tasks
        for p in t.get("policies", [])
        if p.get("policy_category")
    })

    # Human-readable labels, e.g. "consent_and_privacy" -> "Consent & Privacy".
    dim_display = {
        d: d.replace("_", " ").title().replace("And ", "& ") for d in safety_dims
    }

    web_applications = sorted({s for t in tasks for s in t.get("sites", [])})

    # Group tasks by (tier_group, difficulty_tier); tasks without well-formed
    # task_metadata are simply excluded from tier breakdowns.
    tier_config: dict[str, dict[str, list[int]]] = {}
    for t in tasks:
        meta = t.get("task_metadata", {})
        if not isinstance(meta, dict):
            continue
        tier = meta.get("difficulty_tier")
        group = meta.get("tier_group")
        if tier and group:
            tier_config.setdefault(group, {}).setdefault(tier, []).append(t["task_id"])

    task_ids = sorted(t["task_id"] for t in tasks)

    logger.info(
        "Loaded benchmark config: %d tasks, %d policies, %d dimensions, "
        "%d web apps, %d tier groups",
        task_count, policy_count, len(safety_dims),
        len(web_applications), len(tier_config),
    )
    return task_count, policy_count, safety_dims, dim_display, web_applications, tier_config, task_ids
| |
|
| |
|
# Benchmark-wide constants, computed once at import time so the Space
# auto-adapts when test.raw.json grows (new tasks, policies, apps, tiers).
(
    EXPECTED_TASK_COUNT,
    EXPECTED_POLICY_COUNT,
    SAFETY_DIMENSIONS,
    DIMENSION_DISPLAY,
    WEB_APPLICATIONS,
    TIER_CONFIG,
    EXPECTED_TASK_IDS,
) = _load_benchmark_config()
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class ActionSummary(BaseModel):
    """Lightweight action record for server-side re-evaluation.

    Excludes screenshots, full DOM, and LLM reasoning (too large).
    Includes everything needed for offline evaluators.
    """

    # 0-based position of this action within the trajectory.
    step: int = Field(..., ge=0)
    action_type: str = Field(..., description="e.g. click, fill, goto, send_msg_to_user, answer")
    # Raw string arguments passed to the action (e.g. selector, text).
    action_args: List[str] = Field(default_factory=list)
    element_text: str = Field("", description="From state_info, for ActionEvaluator matching")
    element_bid: str = Field("", description="From state_info, for CountAction/Sequence matching")
    observed_url: str = Field("", description="Page URL after this action")
    # URLs of all browser tabs open after this action.
    open_pages_urls: List[str] = Field(default_factory=list)
    timestamp: float = Field(..., description="Epoch seconds")
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class PolicyReport(BaseModel):
    """Per-policy evaluation result within a task."""

    # Index of the policy within the task's policy list.
    policy_index: int = Field(..., ge=0)
    policy_template_id: str
    # One of the benchmark's safety dimensions (policy categories).
    policy_category: str
    source: str = Field(..., description="organization, user, or task")
    violated: bool
    # True when the policy never became applicable during the run.
    dormant: bool
    # Set only when violated: which step/action triggered the violation.
    violating_step: Optional[int] = None
    violating_action: Optional[str] = None
    eval_type: Optional[str] = None
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class TaskEvidence(BaseModel):
    """Evidence for a single task execution."""

    task_id: int = Field(..., ge=0)

    # Outcome: normalized reward in [0, 1] plus requirement/policy breakdown.
    total_reward: float = Field(..., ge=0, le=1)
    satisfied_requirements: int = Field(0, ge=0)
    total_requirements: int = Field(1, ge=1)
    safety_report: List[PolicyReport]

    # Trajectory evidence used for server-side re-evaluation.
    action_sequence: List[ActionSummary]
    visited_urls: List[str] = Field(default_factory=list)

    # Timing — presumably ISO timestamps from the harness (TODO confirm format).
    start_time: str
    end_time: str
    num_steps: int = Field(..., ge=0)

    # Hash binding this evidence to the recorded trajectory (integrity check).
    trajectory_hash: str
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class DimensionMetrics(BaseModel):
    """Per-safety-dimension aggregated metrics."""

    # Safety dimension name (a policy category from the benchmark config).
    dimension: str
    failures: int = Field(..., ge=0)
    total_instances: int = Field(..., ge=0)
    # NOTE(review): "active" appears to mean non-dormant instances — confirm
    # against the aggregation code that fills these fields.
    active_instances: int = Field(..., ge=0)
    dormant_count: int = Field(..., ge=0)
    risk_ratio: float = Field(..., ge=0, le=1)
    active_risk_ratio: float = Field(..., ge=0, le=1)
    risk_tier: str
    active_risk_tier: str
| |
|
| |
|
class TierMetrics(BaseModel):
    """Per-difficulty-tier metrics (Easy/Medium/Hard CRM tasks)."""

    tier: str
    # Completion Rate and Completion under Policy, plus partial variants.
    CR: float = Field(..., ge=0, le=1)
    CuP: float = Field(..., ge=0, le=1)
    semi_CR: float = Field(..., ge=0, le=1)
    semi_CuP: float = Field(..., ge=0, le=1)
| |
|
| |
|
class PerAppMetrics(BaseModel):
    """Per-application metrics."""

    # Web application name (one of the benchmark's sites).
    app: str
    CR: float = Field(..., ge=0, le=1)
    CuP: float = Field(..., ge=0, le=1)
    # Partial-completion variants default to 0 (optional for submitters).
    semi_CR: float = Field(0, ge=0, le=1)
    semi_CuP: float = Field(0, ge=0, le=1)
    task_count: int = Field(..., ge=0)
| |
|
| |
|
class ClaimedMetrics(BaseModel):
    """Aggregate metrics claimed by the submitter.

    These are independently recomputed server-side from task_results.
    Any discrepancy flags the submission for review.
    """

    CR: float = Field(..., ge=0, le=1, description="Completion Rate")
    CuP: float = Field(..., ge=0, le=1, description="Completion under Policy")
    semi_CR: float = Field(..., ge=0, le=1, description="Partial Completion Rate")
    semi_CuP: float = Field(..., ge=0, le=1, description="Partial CuP")
    # Optional multi-run metric: all-pass@k over k runs.
    all_pass_at_k: Optional[float] = Field(None, ge=0, le=1)
    k: Optional[int] = Field(None, ge=1)
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class SubmissionResults(BaseModel):
    """All computed metrics for the submission."""

    metrics: ClaimedMetrics
    dimensions: List[DimensionMetrics]
    # Tier/app breakdowns are optional (not all agents run every subset).
    tiers: Optional[List[TierMetrics]] = None
    apps: Optional[List[PerAppMetrics]] = None
    tasks_evaluated: int = Field(..., ge=0)
    # Defaults to the benchmark-wide task count loaded from test.raw.json.
    tasks_total: int = EXPECTED_TASK_COUNT
    policies_evaluated: int = Field(..., ge=0)
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class SubmissionMetadata(BaseModel):
    """Agent and team metadata for a leaderboard submission."""

    # Required identification fields.
    agent_id: str = Field(..., min_length=1, max_length=128)
    model_name: str = Field(..., min_length=1, max_length=256)
    team: str = Field(..., min_length=1, max_length=256)
    code_repository_url: str = Field(
        ...,
        min_length=1,
        description="Public GitHub/GitLab/HuggingFace repository URL",
    )
    contact_email: str = Field(
        ...,
        min_length=1,
        description="Contact email for verification (not displayed publicly)",
    )

    # Optional descriptive fields.
    paper_url: Optional[str] = None
    agent_framework: Optional[str] = None
    model_family: Optional[str] = None
    is_open_source: Optional[bool] = None
    is_open_weights: Optional[bool] = None
    cost_per_task_usd: Optional[float] = Field(None, ge=0)
    total_cost_usd: Optional[float] = Field(None, ge=0)
    hardware: Optional[str] = None
    num_runs: int = Field(1, ge=1)
    uses_vision: Optional[bool] = None
    max_steps: Optional[int] = Field(None, ge=1)
    description: Optional[str] = Field(None, max_length=1000)

    @field_validator("agent_id")
    @classmethod
    def validate_agent_id(cls, v: str) -> str:
        """Restrict agent_id to URL/filesystem-safe characters.

        Uses re.fullmatch rather than re.match with ``^...$``: the ``$``
        anchor matches before a trailing newline, so the old pattern
        accepted ids like ``"agent\\n"``. fullmatch closes that gap.
        """
        if not re.fullmatch(r"[a-zA-Z0-9_\-\.]+", v):
            raise ValueError(
                "agent_id must contain only alphanumeric characters, "
                "hyphens, underscores, and dots"
            )
        return v

    @field_validator("code_repository_url")
    @classmethod
    def validate_repo_url(cls, v: str) -> str:
        """Require the repository URL to point at a known public host."""
        valid_prefixes = (
            "https://github.com/",
            "https://gitlab.com/",
            "https://huggingface.co/",
            "https://bitbucket.org/",
        )
        # str.startswith accepts a tuple of prefixes — no any() loop needed.
        if not v.startswith(valid_prefixes):
            raise ValueError(
                "code_repository_url must be a public GitHub, GitLab, "
                "HuggingFace, or Bitbucket URL"
            )
        return v
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class IntegritySection(BaseModel):
    """Cryptographic integrity data from the evaluation run."""

    # Unique identifier of the evaluation run that produced this bundle.
    run_id: str
    benchmark_version: str = BENCHMARK_VERSION
    # Epoch-second timestamps for the run window; end may be absent.
    timestamp_start: float
    timestamp_end: Optional[float] = None
    # SHA-256 digests of the evaluation code/config used for the run.
    evaluators_sha256: str
    task_config_sha256: str
    custom_env_sha256: str
    helper_functions_sha256: str
    # Per-task hashes plus the hash of the overall manifest.
    task_hashes: dict
    manifest_hash: str
    hmac_signature: Optional[str] = Field(
        None,
        description="HMAC-SHA256 signature (requires ST_BENCH_SIGNING_KEY)",
    )
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class Submission(BaseModel):
    """Complete leaderboard submission bundle.

    Contains metadata, per-task evidence, computed metrics, and
    cryptographic integrity data.
    """

    schema_version: str = Field("1.0", description="Submission schema version")
    benchmark_version: str = BENCHMARK_VERSION
    # UTC ISO-8601 timestamp, generated automatically at model creation.
    submission_date: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
    )
    metadata: SubmissionMetadata
    results: SubmissionResults
    task_evidence: List[TaskEvidence]
    integrity: IntegritySection

    @field_validator("submission_date")
    @classmethod
    def validate_date(cls, v: str) -> str:
        # Reject any date string that datetime.fromisoformat cannot parse;
        # the original parse error is chained for easier debugging.
        try:
            datetime.fromisoformat(v)
        except ValueError as e:
            raise ValueError(f"submission_date must be ISO 8601 format: {e}") from e
        return v
| |
|