Spaces:
Sleeping
Sleeping
| """DBA Oracle: per-scenario escalating expert guidance. | |
| Feature-flagged (off by default). When ``enable_dba_oracle=True`` is set | |
| at ``reset(...)`` or ``SQL_DRIFT_ENABLE_DBA_ORACLE=1`` is exported in the | |
| environment, the ``consult_dba`` tool becomes available; three hint | |
| tiers per scenario are shipped here, covering all 10 scenarios (6 static | |
| + 4 drift). | |
| Hints escalate in specificity: tier 1 names the likely failure mode and | |
| the diagnostic to confirm it, tier 2 gives the rewrite invariant, and | |
| tier 3 is a near-spoiler SQL skeleton. The rubric penalizes each consult | |
| (consultation rubric), so the agent only wins by consulting if the hint net-reduces | |
| downstream steps. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from typing import Final | |
| # --------------------------------------------------------------------------- | |
| # Per-scenario 3-tier hint tables | |
| # --------------------------------------------------------------------------- | |
| _HINTS: Final[dict[str, tuple[str, str, str]]] = { | |
| "01_correlated_subquery": ( | |
| "[DBA tier 1] The expensive shape is a projection-time correlated subquery: " | |
| "a COUNT over `orders` for every `users` row. Confirm by spotting " | |
| "`SELECT COUNT(*) ... WHERE o.user_id = u.id` in the SELECT list or by " | |
| "checking EXPLAIN for repeated dependent work. Preserve one output row per user.", | |
| "[DBA tier 2] Aggregate fulfilled orders once by `user_id`, then LEFT JOIN that " | |
| "small result to `users`. Keep the join outer and wrap the count with " | |
| "`COALESCE(..., 0)` so users with no fulfilled orders stay in the result.", | |
| "[DBA tier 3] Use `SELECT u.id, u.tier, COALESCE(c.n, 0) AS fulfilled_orders " | |
| "FROM users u LEFT JOIN (SELECT user_id, COUNT(*) AS n FROM orders WHERE " | |
| "status = 'fulfilled' GROUP BY user_id) c ON c.user_id = u.id ORDER BY u.id`. " | |
| "Validate the row count equals the number of users.", | |
| ), | |
| "02_select_star_join": ( | |
| "[DBA tier 1] The waste is over-projection: the inner three-way join uses " | |
| "`SELECT *`, including wide product text and order metadata, while the outer " | |
| "query keeps only `order_id`, product `name`, and `qty`.", | |
| "[DBA tier 2] Inline the join and project exactly `oi.order_id`, `p.name`, and " | |
| "`oi.qty`. Keep the products and orders joins plus the filters " | |
| "`p.category = 'books'` and `oi.qty >= 2`; the wrapper exists only to hide " | |
| "the star projection.", | |
| "[DBA tier 3] Rewrite as `SELECT oi.order_id, p.name, oi.qty FROM order_items oi " | |
| "JOIN products p ON p.id = oi.product_id JOIN orders o ON o.id = oi.order_id " | |
| "WHERE p.category = 'books' AND oi.qty >= 2 ORDER BY oi.order_id, p.name`.", | |
| ), | |
| "03_cartesian_join": ( | |
| "[DBA tier 1] This is an accidental cartesian join. `FROM events e, tenants t` " | |
| "combined with `t.id = e.tenant_id + 0` prevents the optimizer from seeing a " | |
| "clean tenant-key join early.", | |
| "[DBA tier 2] Turn the comma join into an explicit equijoin on the tenant key. " | |
| "Move only `t.id = e.tenant_id` into `ON`; keep the severity filter in `WHERE` " | |
| "and preserve grouping by tenant tier.", | |
| "[DBA tier 3] Use `SELECT t.tier, COUNT(*) AS n FROM events e JOIN tenants t " | |
| "ON t.id = e.tenant_id WHERE e.severity IN ('error', 'critical') GROUP BY " | |
| "t.tier ORDER BY t.tier`. Avoid arithmetic on the join key.", | |
| ), | |
| "04_distinct_groupby": ( | |
| "[DBA tier 1] The duplicate-removal work is redundant. `GROUP BY session_id, path` " | |
| "already emits one row per `(session_id, path)` pair, so a leading `DISTINCT` " | |
| "adds a second deduplication pass over grouped rows.", | |
| "[DBA tier 2] Do not introduce a CTE or change the aggregation grain. Remove only " | |
| "`DISTINCT`; keep `COUNT(*) AS hits`, the same GROUP BY keys, and the same " | |
| "ordering so row identity and sort order stay stable.", | |
| "[DBA tier 3] The target shape is `SELECT session_id, path, COUNT(*) AS hits " | |
| "FROM pageviews GROUP BY session_id, path ORDER BY session_id, path`. Validate " | |
| "against the baseline result before comparing runtime.", | |
| ), | |
| "05_nested_subquery": ( | |
| "[DBA tier 1] The nested `IN` clauses express a semi-join: authors who wrote " | |
| "comments on published articles. The important identity is `comments.author_id`, " | |
| "not `articles.author_id`.", | |
| "[DBA tier 2] Flatten to `authors -> comments -> articles`, filter " | |
| "`articles.status = 'published'`, and select distinct author display names. " | |
| "`DISTINCT` is required here because one author can have many qualifying comments.", | |
| "[DBA tier 3] Use `SELECT DISTINCT a.display_name FROM authors a JOIN comments c " | |
| "ON c.author_id = a.id JOIN articles ar ON ar.id = c.article_id WHERE " | |
| "ar.status = 'published' ORDER BY a.display_name`.", | |
| ), | |
| "06_having_as_where": ( | |
| "[DBA tier 1] `status = 'fulfilled'` is a row-level predicate sitting in HAVING, " | |
| "so the engine groups every status first and discards most groups afterward. " | |
| "Only `SUM(amount_cents) >= 100000` truly belongs after aggregation.", | |
| "[DBA tier 2] Move the status filter into `WHERE` before the GROUP BY. Keep " | |
| "`status` in the projection and grouping to preserve the result shape, then " | |
| "leave the aggregate threshold in HAVING.", | |
| "[DBA tier 3] Use `SELECT tenant_id, status, SUM(amount_cents) AS total_cents " | |
| "FROM orders WHERE status = 'fulfilled' GROUP BY tenant_id, status HAVING " | |
| "SUM(amount_cents) >= 100000 ORDER BY tenant_id`.", | |
| ), | |
| "07_drift_column_rename": ( | |
| "[DBA tier 1] If the old aggregation now fails with an unknown `user_id`, this is " | |
| "schema drift rather than a performance issue. Read the changelog or describe " | |
| "`orders`; `users.id` is unchanged, but the order-owner column moved.", | |
| "[DBA tier 2] Replace every reference to `orders.user_id` with `orders.account_id` " | |
| "in SELECT, GROUP BY, JOIN, and ORDER BY positions. Do not change the aggregate " | |
| "logic; the rename preserves row semantics.", | |
| "[DBA tier 3] Submit `SELECT account_id, COUNT(*) AS n_orders, ROUND(SUM(amount), 2) " | |
| "AS total FROM orders GROUP BY account_id ORDER BY account_id`. Validate that " | |
| "counts and totals match the pre-drift business result.", | |
| ), | |
| "08_drift_date_format": ( | |
| "[DBA tier 1] The `events.ts` identifier still exists, but its type changed from " | |
| "ISO text to BIGINT epoch milliseconds. A string date predicate can parse or " | |
| "compare incorrectly; confirm with `describe_table('events')` and samples.", | |
| "[DBA tier 2] Keep the same half-open UTC day window, but express both bounds as " | |
| "epoch-ms integers. For 2026-04-21T00:00:00Z through the next midnight, use " | |
| "`1776729600000 <= ts < 1776816000000`.", | |
| "[DBA tier 3] Use `SELECT kind, COUNT(*) AS n FROM events WHERE ts >= " | |
| "1776729600000 AND ts < 1776816000000 GROUP BY kind ORDER BY kind`. Do not quote " | |
| "the bounds; they must be numeric comparisons against the BIGINT column.", | |
| ), | |
| "09_drift_enum_rule": ( | |
| "[DBA tier 1] A formerly valid equality on `status = 'active'` now silently loses " | |
| "rows because the business state was split into multiple stored labels. Sample " | |
| "`tenants.status` before assuming the old lowercase value still exists.", | |
| '[DBA tier 2] Preserve the business meaning "active tenants" by filtering on the ' | |
| "union of replacement labels. Keep the same grouping by tier and ordering; only " | |
| "the status predicate changes.", | |
| "[DBA tier 3] Use `SELECT tier, COUNT(*) AS n FROM tenants WHERE status IN " | |
| "('ACTIVE', 'ACTIVE_V2') GROUP BY tier ORDER BY tier`. Avoid `LOWER(status) = " | |
| "'active'`; it misses `ACTIVE_V2`.", | |
| ), | |
| "10_drift_field_deprecation": ( | |
| "[DBA tier 1] The inline `posts.author_name` column was normalized away. Describe " | |
| "`posts` and list tables: you should see `posts.users_id` plus a new `users` " | |
| "lookup carrying the human-readable name.", | |
| "[DBA tier 2] Join `posts` to `users` through the new FK, group by `u.full_name`, " | |
| "and alias it back to `author_name` so the result keeps the old report shape. " | |
| "The post count still comes from `posts`.", | |
| "[DBA tier 3] Use `SELECT u.full_name AS author_name, COUNT(*) AS n_posts FROM " | |
| "posts p JOIN users u ON u.id = p.users_id GROUP BY u.full_name ORDER BY " | |
| "u.full_name`.", | |
| ), | |
| } | |
| # --------------------------------------------------------------------------- | |
| # Public API | |
| # --------------------------------------------------------------------------- | |
| def is_enabled( | |
| reset_flag: bool | None = None, *, env_var: str = "SQL_DRIFT_ENABLE_DBA_ORACLE" | |
| ) -> bool: | |
| """Resolve the feature flag from (reset kwarg, env var, default-off).""" | |
| if reset_flag is not None: | |
| return bool(reset_flag) | |
| raw = os.environ.get(env_var, "").strip().lower() | |
| return raw in ("1", "true", "yes", "on") | |
| def get_hint(scenario_id: str, tier: int) -> str: | |
| """Return the hint for ``(scenario_id, tier)``; clamps tier to [1, 3]. | |
| Raises :class:`KeyError` on unknown scenario so tests can detect when | |
| a new scenario was added without a hint table entry. | |
| """ | |
| if scenario_id not in _HINTS: | |
| raise KeyError(f"no DBA hints for scenario_id={scenario_id!r}; known: {sorted(_HINTS)}") | |
| tier = max(1, min(3, int(tier))) | |
| return _HINTS[scenario_id][tier - 1] | |
| def has_hints(scenario_id: str) -> bool: | |
| return scenario_id in _HINTS | |
| def known_scenarios() -> frozenset[str]: | |
| return frozenset(_HINTS) | |
| __all__ = [ | |
| "get_hint", | |
| "has_hints", | |
| "is_enabled", | |
| "known_scenarios", | |
| ] | |