Spaces:
Sleeping
Sleeping
| """Scenario spec + instance primitives. | |
| Each concrete scenario file in :mod:`scenarios` exports: | |
| - ``SPEC: ScenarioSpec`` — the immutable metadata (id, family, tags, | |
| optional drift config) plus a bound ``builder`` callable. | |
| The builder takes ``(spec, seed, scale)`` and returns a ready-to-attach | |
| :class:`ScenarioInstance` whose DuckDB connection has been loaded with | |
| deterministic fixtures, ground-truth hashes pre-computed, and baseline | |
| runtime measured. ``base_scale`` is author-tuned per scenario so the | |
| measured baseline clears :data:`BASELINE_MIN_MS` on a single build — | |
| the old timing-driven reroll loop was removed because it coupled the | |
| fixture RNG seed to the retry count, which destroyed determinism | |
| whenever CI hit a jitter-induced retry. | |
| """ | |
| from __future__ import annotations | |
| from collections.abc import Callable | |
| from dataclasses import dataclass, field | |
| from typing import TYPE_CHECKING, Any, Literal | |
| if TYPE_CHECKING: | |
| import duckdb | |
| from utilities.logger import get_module_logger | |
# Module-level logger. materialize() uses it to warn when a measured
# baseline lands below the scenario's floor.
_LOG = get_module_logger(__name__)
# Closed string vocabularies used in annotations across this module.

# Scenario family recorded on ScenarioSpec.family.
Family = Literal["ecommerce", "events", "cms", "saas_logs", "multitenant"]

# Drift category recorded on DriftConfig.kind.
DriftKind = Literal["column_rename", "date_format", "enum_rule", "field_deprecation"]

# Coarse difficulty knob; _scale_for_difficulty() maps it onto the
# row-count scale handed to the scenario builder.
DifficultyLevel = Literal["easy", "normal", "hard"]
# Baseline floor — empirically calibrated rather than pinned to a fixed
# 50 ms marketing target. DuckDB on in-memory fixtures at CI-reasonable
# scales (a few tens of thousands of rows) measures warm baselines of
# 0.4–2 ms on the anti-pattern queries shipped here; reaching 50 ms
# would require multi-minute fixture builds per scenario, which is
# untenable for both CI and RL rollouts (every reset rebuilds).
#
# 0.3 ms is ~3–5× the median-of-3 warm jitter floor on a quiet CPU
# (observed jitter ~60–100 µs). This SNR is tight but workable because
# the rubric gates the speedup reward at 1.2× before any credit is
# issued, so jitter-induced near-1× "speedups" score zero. The cap at
# 64× bounds upside. A 2× rewrite against a 0.3 ms baseline lands at
# 0.15 ms — still distinguishable from jitter under median-of-3
# smoothing.
#
# The same floor applies in production and CI — no env-var escape
# hatch — so tests exercise the real reward distribution. Per-scenario
# overrides may raise *or* lower this floor when a scenario's query
# shape has a different natural baseline (see the comment on the
# ``ScenarioSpec.baseline_min_ms`` field).
BASELINE_MIN_MS = 0.3
@dataclass(frozen=True)
class DriftConfig:
    """Drift description attached to a scenario.

    Declared as a frozen dataclass: the same config object is passed by
    reference from a spec to every instance materialized from it, so it
    must not be rebound after construction. The dataclass machinery also
    supplies the keyword constructor and runs ``__post_init__``.

    Raises:
        ValueError: on construction when ``min_step < 1``, when
            ``max_step < min_step``, or when ``cooldown_steps < 0``.
    """

    kind: DriftKind
    # Kind-specific parameters. The schema is not defined in this module.
    payload: dict[str, Any]
    # NOTE(review): presumably the step window in which drift may fire
    # and the cooldown applied afterwards — confirm against the env.
    min_step: int = 6
    max_step: int = 12
    cooldown_steps: int = 2

    def __post_init__(self) -> None:
        """Reject step bounds that cannot describe a valid window."""
        if self.min_step < 1:
            raise ValueError("min_step must be >= 1")
        if self.max_step < self.min_step:
            raise ValueError("max_step must be >= min_step")
        if self.cooldown_steps < 0:
            raise ValueError("cooldown_steps must be >= 0")
@dataclass
class ScenarioInstance:
    """Concretized scenario — ready-to-attach DuckDB fixture + ground truths.

    Deliberately a mutable dataclass: ``materialize()`` leaves
    ``gt_result_hash_postdrift`` as ``None`` and it is filled in later,
    once drift has been applied.
    """

    conn: duckdb.DuckDBPyConnection
    baseline_sql: str
    gt_sql_predrift: str
    gt_sql_postdrift: str | None
    baseline_runtime_ms: float
    baseline_tokens: int
    gt_result_hash_predrift: str
    gt_result_hash_postdrift: str | None
    drift_config: DriftConfig | None
    schema_synopsis: str
    # Drift-distinctive identifier sets consumed by the drift-adapt
    # rubric. ``postdrift_identifiers`` marks identifiers/literals
    # the correct post-drift rewrite MUST introduce; ``predrift_identifiers``
    # marks identifiers/literals a submission that ignored the drift
    # WOULD retain. Together they let the rubric distinguish "adapted"
    # from "did not adapt" for drift kinds where a single identifier
    # (e.g. ``ts`` under date-format drift) is shared by both sides.
    postdrift_identifiers: frozenset[str] = field(default_factory=frozenset)
    predrift_identifiers: frozenset[str] = field(default_factory=frozenset)
| # Builder signature: (spec, seed, scale) -> (conn, baseline_sql, | |
| # gt_sql_predrift, gt_sql_postdrift, schema_synopsis, | |
| # postdrift_identifiers, predrift_identifiers). | |
| BuilderResult = tuple[ | |
| "duckdb.DuckDBPyConnection", | |
| str, # baseline_sql | |
| str, # gt_sql_predrift | |
| str | None, # gt_sql_postdrift | |
| str, # schema_synopsis | |
| frozenset[str], # postdrift_identifiers | |
| frozenset[str], # predrift_identifiers | |
| ] | |
| BuilderFn = Callable[["ScenarioSpec", int, int], BuilderResult] | |
@dataclass(frozen=True)
class ScenarioSpec:
    """Immutable scenario metadata + bound builder.

    Frozen so the spec cannot be rebound after construction; the builder
    receives the spec itself as its first argument.
    """

    scenario_id: str
    family: Family
    tags: frozenset[str]
    drift_config: DriftConfig | None
    builder: BuilderFn
    # Row-count scale passed to the builder. Author-tuned so the
    # measured baseline clears ``baseline_min_ms`` on a single build;
    # materialize() emits a warning (but does not retry) if the floor
    # is not met, signalling the author to bump this value.
    base_scale: int = 1_000
    # Per-scenario baseline floor override. Most scenarios inherit the
    # module default. Scenarios whose query shape naturally lands at a
    # very different baseline (e.g. a trivial single-table GROUP BY
    # that can't be meaningfully sped up, or a large join whose raw
    # shape is already expensive) can pin a different floor with a
    # documented rationale at the SPEC site.
    baseline_min_ms: float = BASELINE_MIN_MS

    def materialize(self, seed: int, *, difficulty: DifficultyLevel = "normal") -> ScenarioInstance:
        """Build an instance of this spec via the module-level ``materialize``.

        Args:
            seed: Forwarded unchanged to the builder.
            difficulty: Coarse difficulty level; defaults to ``"normal"``.

        Returns:
            The ScenarioInstance produced by ``materialize(self, seed, ...)``.
        """
        # Inside a method body this name resolves to the module-level
        # function, not to this method.
        return materialize(self, seed, difficulty=difficulty)
def count_tokens(sql: str) -> int:
    """Rough whitespace/punctuation token count — good enough for baseline.

    A token is either a maximal run of word characters (letters, digits,
    underscore) or a single character that is neither whitespace nor a
    word character. Whitespace only separates tokens.

    Args:
        sql: SQL text to measure.

    Returns:
        Number of tokens found; ``0`` for empty or whitespace-only input.
    """
    import re

    return len(re.findall(r"\w+|[^\s\w]", sql))
def _scale_for_difficulty(base_scale: int, difficulty: DifficultyLevel) -> int:
    """Map a coarse difficulty level onto the scenario builder's row-count scale.

    Args:
        base_scale: The scenario's author-tuned row-count scale.
        difficulty: ``"easy"`` halves the scale, ``"hard"`` doubles it;
            any other value leaves it unchanged.

    Returns:
        The scale to pass to the builder. The easy branch floors at 1 so
        integer halving never yields 0.
    """
    if difficulty == "easy":
        return max(1, base_scale // 2)
    if difficulty == "hard":
        return base_scale * 2
    return base_scale
def materialize(
    spec: ScenarioSpec, seed: int, *, difficulty: DifficultyLevel = "normal"
) -> ScenarioInstance:
    """Build a ScenarioInstance once, measure baseline, and return.

    Single build — deterministic, no retry. If the measured baseline is
    below ``spec.baseline_min_ms`` a warning is logged so scenario
    authors can bump ``base_scale``; the instance is still returned so
    episodes can proceed.

    Args:
        spec: Scenario whose ``builder`` produces the fixture.
        seed: Forwarded unchanged to the builder.
        difficulty: Coarse level mapped onto the builder's row-count
            scale by ``_scale_for_difficulty``.

    Returns:
        A ScenarioInstance holding the open connection, the measured
        baseline runtime, the baseline token count and the pre-drift
        ground-truth hash. ``gt_result_hash_postdrift`` is left ``None``.

    Raises:
        Exception: anything raised while measuring the baseline or
            computing the pre-drift ground truth is re-raised after the
            builder's connection has been closed.
    """
    from engine.profiler import median_of_3_warm_ms
    from engine.verifier import canonical_row_hash

    scale = _scale_for_difficulty(spec.base_scale, difficulty)
    (
        conn,
        baseline_sql,
        gt_pre,
        gt_post,
        synopsis,
        postdrift_ids,
        predrift_ids,
    ) = spec.builder(spec, seed, scale)

    # From here on this function owns the connection until it is handed
    # to the returned instance, so every step that can fail sits under
    # one cleanup path and a failing ground-truth query cannot leak it.
    try:
        baseline_ms = median_of_3_warm_ms(conn, baseline_sql)
        if baseline_ms < spec.baseline_min_ms:
            _LOG.warning(
                "%s: baseline %.2fms < %.2fms floor at difficulty=%s scale=%d — bump base_scale",
                spec.scenario_id,
                baseline_ms,
                spec.baseline_min_ms,
                difficulty,
                scale,
            )
        pre_rows = conn.execute(gt_pre).fetchall()
        gt_hash_pre = canonical_row_hash(pre_rows)
    except Exception:
        conn.close()
        raise

    # Post-drift ground-truth hashes are computed AFTER drift is applied
    # at runtime — not here. The env backfills them from gt_post once
    # drift fires.
    return ScenarioInstance(
        conn=conn,
        baseline_sql=baseline_sql,
        gt_sql_predrift=gt_pre,
        gt_sql_postdrift=gt_post,
        baseline_runtime_ms=baseline_ms,
        baseline_tokens=count_tokens(baseline_sql),
        gt_result_hash_predrift=gt_hash_pre,
        gt_result_hash_postdrift=None,
        drift_config=spec.drift_config,
        schema_synopsis=synopsis,
        postdrift_identifiers=postdrift_ids,
        predrift_identifiers=predrift_ids,
    )
# Public API of this module.
__all__ = [
    "BASELINE_MIN_MS",
    "BuilderFn",
    "BuilderResult",
    "DifficultyLevel",
    "DriftConfig",
    "DriftKind",
    "Family",
    "ScenarioInstance",
    "ScenarioSpec",
    "count_tokens",
    "materialize",
]