sql-drift-env / actors /dba_oracle.py
visheshrathi's picture
Upload folder using huggingface_hub
5850885 verified
"""DBA Oracle: per-scenario escalating expert guidance.
Feature-flagged (off by default). When ``enable_dba_oracle=True`` is set
at ``reset(...)`` or ``SQL_DRIFT_ENABLE_DBA_ORACLE=1`` is exported in the
environment, the ``consult_dba`` tool becomes available; three hint
tiers per scenario are shipped here, covering all 10 scenarios (6 static
+ 4 drift).
Hints escalate in specificity: tier 1 names the likely failure mode and
the diagnostic to confirm it, tier 2 gives the rewrite invariant, and
tier 3 is a near-spoiler SQL skeleton. The rubric penalizes each consult
(consultation rubric), so the agent only wins by consulting if the hint net-reduces
downstream steps.
"""
from __future__ import annotations
import os
from typing import Final
# ---------------------------------------------------------------------------
# Per-scenario 3-tier hint tables
# ---------------------------------------------------------------------------
_HINTS: Final[dict[str, tuple[str, str, str]]] = {
"01_correlated_subquery": (
"[DBA tier 1] The expensive shape is a projection-time correlated subquery: "
"a COUNT over `orders` for every `users` row. Confirm by spotting "
"`SELECT COUNT(*) ... WHERE o.user_id = u.id` in the SELECT list or by "
"checking EXPLAIN for repeated dependent work. Preserve one output row per user.",
"[DBA tier 2] Aggregate fulfilled orders once by `user_id`, then LEFT JOIN that "
"small result to `users`. Keep the join outer and wrap the count with "
"`COALESCE(..., 0)` so users with no fulfilled orders stay in the result.",
"[DBA tier 3] Use `SELECT u.id, u.tier, COALESCE(c.n, 0) AS fulfilled_orders "
"FROM users u LEFT JOIN (SELECT user_id, COUNT(*) AS n FROM orders WHERE "
"status = 'fulfilled' GROUP BY user_id) c ON c.user_id = u.id ORDER BY u.id`. "
"Validate the row count equals the number of users.",
),
"02_select_star_join": (
"[DBA tier 1] The waste is over-projection: the inner three-way join uses "
"`SELECT *`, including wide product text and order metadata, while the outer "
"query keeps only `order_id`, product `name`, and `qty`.",
"[DBA tier 2] Inline the join and project exactly `oi.order_id`, `p.name`, and "
"`oi.qty`. Keep the products and orders joins plus the filters "
"`p.category = 'books'` and `oi.qty >= 2`; the wrapper exists only to hide "
"the star projection.",
"[DBA tier 3] Rewrite as `SELECT oi.order_id, p.name, oi.qty FROM order_items oi "
"JOIN products p ON p.id = oi.product_id JOIN orders o ON o.id = oi.order_id "
"WHERE p.category = 'books' AND oi.qty >= 2 ORDER BY oi.order_id, p.name`.",
),
"03_cartesian_join": (
"[DBA tier 1] This is an accidental cartesian join. `FROM events e, tenants t` "
"combined with `t.id = e.tenant_id + 0` prevents the optimizer from seeing a "
"clean tenant-key join early.",
"[DBA tier 2] Turn the comma join into an explicit equijoin on the tenant key. "
"Move only `t.id = e.tenant_id` into `ON`; keep the severity filter in `WHERE` "
"and preserve grouping by tenant tier.",
"[DBA tier 3] Use `SELECT t.tier, COUNT(*) AS n FROM events e JOIN tenants t "
"ON t.id = e.tenant_id WHERE e.severity IN ('error', 'critical') GROUP BY "
"t.tier ORDER BY t.tier`. Avoid arithmetic on the join key.",
),
"04_distinct_groupby": (
"[DBA tier 1] The duplicate-removal work is redundant. `GROUP BY session_id, path` "
"already emits one row per `(session_id, path)` pair, so a leading `DISTINCT` "
"adds a second deduplication pass over grouped rows.",
"[DBA tier 2] Do not introduce a CTE or change the aggregation grain. Remove only "
"`DISTINCT`; keep `COUNT(*) AS hits`, the same GROUP BY keys, and the same "
"ordering so row identity and sort order stay stable.",
"[DBA tier 3] The target shape is `SELECT session_id, path, COUNT(*) AS hits "
"FROM pageviews GROUP BY session_id, path ORDER BY session_id, path`. Validate "
"against the baseline result before comparing runtime.",
),
"05_nested_subquery": (
"[DBA tier 1] The nested `IN` clauses express a semi-join: authors who wrote "
"comments on published articles. The important identity is `comments.author_id`, "
"not `articles.author_id`.",
"[DBA tier 2] Flatten to `authors -> comments -> articles`, filter "
"`articles.status = 'published'`, and select distinct author display names. "
"`DISTINCT` is required here because one author can have many qualifying comments.",
"[DBA tier 3] Use `SELECT DISTINCT a.display_name FROM authors a JOIN comments c "
"ON c.author_id = a.id JOIN articles ar ON ar.id = c.article_id WHERE "
"ar.status = 'published' ORDER BY a.display_name`.",
),
"06_having_as_where": (
"[DBA tier 1] `status = 'fulfilled'` is a row-level predicate sitting in HAVING, "
"so the engine groups every status first and discards most groups afterward. "
"Only `SUM(amount_cents) >= 100000` truly belongs after aggregation.",
"[DBA tier 2] Move the status filter into `WHERE` before the GROUP BY. Keep "
"`status` in the projection and grouping to preserve the result shape, then "
"leave the aggregate threshold in HAVING.",
"[DBA tier 3] Use `SELECT tenant_id, status, SUM(amount_cents) AS total_cents "
"FROM orders WHERE status = 'fulfilled' GROUP BY tenant_id, status HAVING "
"SUM(amount_cents) >= 100000 ORDER BY tenant_id`.",
),
"07_drift_column_rename": (
"[DBA tier 1] If the old aggregation now fails with an unknown `user_id`, this is "
"schema drift rather than a performance issue. Read the changelog or describe "
"`orders`; `users.id` is unchanged, but the order-owner column moved.",
"[DBA tier 2] Replace every reference to `orders.user_id` with `orders.account_id` "
"in SELECT, GROUP BY, JOIN, and ORDER BY positions. Do not change the aggregate "
"logic; the rename preserves row semantics.",
"[DBA tier 3] Submit `SELECT account_id, COUNT(*) AS n_orders, ROUND(SUM(amount), 2) "
"AS total FROM orders GROUP BY account_id ORDER BY account_id`. Validate that "
"counts and totals match the pre-drift business result.",
),
"08_drift_date_format": (
"[DBA tier 1] The `events.ts` identifier still exists, but its type changed from "
"ISO text to BIGINT epoch milliseconds. A string date predicate can parse or "
"compare incorrectly; confirm with `describe_table('events')` and samples.",
"[DBA tier 2] Keep the same half-open UTC day window, but express both bounds as "
"epoch-ms integers. For 2026-04-21T00:00:00Z through the next midnight, use "
"`1776729600000 <= ts < 1776816000000`.",
"[DBA tier 3] Use `SELECT kind, COUNT(*) AS n FROM events WHERE ts >= "
"1776729600000 AND ts < 1776816000000 GROUP BY kind ORDER BY kind`. Do not quote "
"the bounds; they must be numeric comparisons against the BIGINT column.",
),
"09_drift_enum_rule": (
"[DBA tier 1] A formerly valid equality on `status = 'active'` now silently loses "
"rows because the business state was split into multiple stored labels. Sample "
"`tenants.status` before assuming the old lowercase value still exists.",
'[DBA tier 2] Preserve the business meaning "active tenants" by filtering on the '
"union of replacement labels. Keep the same grouping by tier and ordering; only "
"the status predicate changes.",
"[DBA tier 3] Use `SELECT tier, COUNT(*) AS n FROM tenants WHERE status IN "
"('ACTIVE', 'ACTIVE_V2') GROUP BY tier ORDER BY tier`. Avoid `LOWER(status) = "
"'active'`; it misses `ACTIVE_V2`.",
),
"10_drift_field_deprecation": (
"[DBA tier 1] The inline `posts.author_name` column was normalized away. Describe "
"`posts` and list tables: you should see `posts.users_id` plus a new `users` "
"lookup carrying the human-readable name.",
"[DBA tier 2] Join `posts` to `users` through the new FK, group by `u.full_name`, "
"and alias it back to `author_name` so the result keeps the old report shape. "
"The post count still comes from `posts`.",
"[DBA tier 3] Use `SELECT u.full_name AS author_name, COUNT(*) AS n_posts FROM "
"posts p JOIN users u ON u.id = p.users_id GROUP BY u.full_name ORDER BY "
"u.full_name`.",
),
}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def is_enabled(
reset_flag: bool | None = None, *, env_var: str = "SQL_DRIFT_ENABLE_DBA_ORACLE"
) -> bool:
"""Resolve the feature flag from (reset kwarg, env var, default-off)."""
if reset_flag is not None:
return bool(reset_flag)
raw = os.environ.get(env_var, "").strip().lower()
return raw in ("1", "true", "yes", "on")
def get_hint(scenario_id: str, tier: int) -> str:
"""Return the hint for ``(scenario_id, tier)``; clamps tier to [1, 3].
Raises :class:`KeyError` on unknown scenario so tests can detect when
a new scenario was added without a hint table entry.
"""
if scenario_id not in _HINTS:
raise KeyError(f"no DBA hints for scenario_id={scenario_id!r}; known: {sorted(_HINTS)}")
tier = max(1, min(3, int(tier)))
return _HINTS[scenario_id][tier - 1]
def has_hints(scenario_id: str) -> bool:
return scenario_id in _HINTS
def known_scenarios() -> frozenset[str]:
return frozenset(_HINTS)
__all__ = [
"get_hint",
"has_hints",
"is_enabled",
"known_scenarios",
]