Spaces:

visheshrathi
/

sql-drift-env

Sleeping

App Files Files Community

sql-drift-env / actors /dba_oracle.py

visheshrathi

Upload folder using huggingface_hub

5850885 verified about 1 month ago

raw

history blame contribute delete

9.97 kB

	"""DBA Oracle: per-scenario escalating expert guidance.

	Feature-flagged (off by default). When ``enable_dba_oracle=True`` is set
	at ``reset(...)`` or ``SQL_DRIFT_ENABLE_DBA_ORACLE=1`` is exported in the
	environment, the ``consult_dba`` tool becomes available; three hint
	tiers per scenario are shipped here, covering all 10 scenarios (6 static
	+ 4 drift).

	Hints escalate in specificity: tier 1 names the likely failure mode and
	the diagnostic to confirm it, tier 2 gives the rewrite invariant, and
	tier 3 is a near-spoiler SQL skeleton. The rubric penalizes each consult
	(consultation rubric), so the agent only wins by consulting if the hint net-reduces
	downstream steps.
	"""

	from __future__ import annotations

	import os
	from typing import Final

	# ---------------------------------------------------------------------------
	# Per-scenario 3-tier hint tables
	# ---------------------------------------------------------------------------


	_HINTS: Final[dict[str, tuple[str, str, str]]] = {
	"01_correlated_subquery": (
	"[DBA tier 1] The expensive shape is a projection-time correlated subquery: "
	"a COUNT over `orders` for every `users` row. Confirm by spotting "
	"`SELECT COUNT(*) ... WHERE o.user_id = u.id` in the SELECT list or by "
	"checking EXPLAIN for repeated dependent work. Preserve one output row per user.",
	"[DBA tier 2] Aggregate fulfilled orders once by `user_id`, then LEFT JOIN that "
	"small result to `users`. Keep the join outer and wrap the count with "
	"`COALESCE(..., 0)` so users with no fulfilled orders stay in the result.",
	"[DBA tier 3] Use `SELECT u.id, u.tier, COALESCE(c.n, 0) AS fulfilled_orders "
	"FROM users u LEFT JOIN (SELECT user_id, COUNT(*) AS n FROM orders WHERE "
	"status = 'fulfilled' GROUP BY user_id) c ON c.user_id = u.id ORDER BY u.id`. "
	"Validate the row count equals the number of users.",
	),
	"02_select_star_join": (
	"[DBA tier 1] The waste is over-projection: the inner three-way join uses "
	"`SELECT *`, including wide product text and order metadata, while the outer "
	"query keeps only `order_id`, product `name`, and `qty`.",
	"[DBA tier 2] Inline the join and project exactly `oi.order_id`, `p.name`, and "
	"`oi.qty`. Keep the products and orders joins plus the filters "
	"`p.category = 'books'` and `oi.qty >= 2`; the wrapper exists only to hide "
	"the star projection.",
	"[DBA tier 3] Rewrite as `SELECT oi.order_id, p.name, oi.qty FROM order_items oi "
	"JOIN products p ON p.id = oi.product_id JOIN orders o ON o.id = oi.order_id "
	"WHERE p.category = 'books' AND oi.qty >= 2 ORDER BY oi.order_id, p.name`.",
	),
	"03_cartesian_join": (
	"[DBA tier 1] This is an accidental cartesian join. `FROM events e, tenants t` "
	"combined with `t.id = e.tenant_id + 0` prevents the optimizer from seeing a "
	"clean tenant-key join early.",
	"[DBA tier 2] Turn the comma join into an explicit equijoin on the tenant key. "
	"Move only `t.id = e.tenant_id` into `ON`; keep the severity filter in `WHERE` "
	"and preserve grouping by tenant tier.",
	"[DBA tier 3] Use `SELECT t.tier, COUNT(*) AS n FROM events e JOIN tenants t "
	"ON t.id = e.tenant_id WHERE e.severity IN ('error', 'critical') GROUP BY "
	"t.tier ORDER BY t.tier`. Avoid arithmetic on the join key.",
	),
	"04_distinct_groupby": (
	"[DBA tier 1] The duplicate-removal work is redundant. `GROUP BY session_id, path` "
	"already emits one row per `(session_id, path)` pair, so a leading `DISTINCT` "
	"adds a second deduplication pass over grouped rows.",
	"[DBA tier 2] Do not introduce a CTE or change the aggregation grain. Remove only "
	"`DISTINCT`; keep `COUNT(*) AS hits`, the same GROUP BY keys, and the same "
	"ordering so row identity and sort order stay stable.",
	"[DBA tier 3] The target shape is `SELECT session_id, path, COUNT(*) AS hits "
	"FROM pageviews GROUP BY session_id, path ORDER BY session_id, path`. Validate "
	"against the baseline result before comparing runtime.",
	),
	"05_nested_subquery": (
	"[DBA tier 1] The nested `IN` clauses express a semi-join: authors who wrote "
	"comments on published articles. The important identity is `comments.author_id`, "
	"not `articles.author_id`.",
	"[DBA tier 2] Flatten to `authors -> comments -> articles`, filter "
	"`articles.status = 'published'`, and select distinct author display names. "
	"`DISTINCT` is required here because one author can have many qualifying comments.",
	"[DBA tier 3] Use `SELECT DISTINCT a.display_name FROM authors a JOIN comments c "
	"ON c.author_id = a.id JOIN articles ar ON ar.id = c.article_id WHERE "
	"ar.status = 'published' ORDER BY a.display_name`.",
	),
	"06_having_as_where": (
	"[DBA tier 1] `status = 'fulfilled'` is a row-level predicate sitting in HAVING, "
	"so the engine groups every status first and discards most groups afterward. "
	"Only `SUM(amount_cents) >= 100000` truly belongs after aggregation.",
	"[DBA tier 2] Move the status filter into `WHERE` before the GROUP BY. Keep "
	"`status` in the projection and grouping to preserve the result shape, then "
	"leave the aggregate threshold in HAVING.",
	"[DBA tier 3] Use `SELECT tenant_id, status, SUM(amount_cents) AS total_cents "
	"FROM orders WHERE status = 'fulfilled' GROUP BY tenant_id, status HAVING "
	"SUM(amount_cents) >= 100000 ORDER BY tenant_id`.",
	),
	"07_drift_column_rename": (
	"[DBA tier 1] If the old aggregation now fails with an unknown `user_id`, this is "
	"schema drift rather than a performance issue. Read the changelog or describe "
	"`orders`; `users.id` is unchanged, but the order-owner column moved.",
	"[DBA tier 2] Replace every reference to `orders.user_id` with `orders.account_id` "
	"in SELECT, GROUP BY, JOIN, and ORDER BY positions. Do not change the aggregate "
	"logic; the rename preserves row semantics.",
	"[DBA tier 3] Submit `SELECT account_id, COUNT(*) AS n_orders, ROUND(SUM(amount), 2) "
	"AS total FROM orders GROUP BY account_id ORDER BY account_id`. Validate that "
	"counts and totals match the pre-drift business result.",
	),
	"08_drift_date_format": (
	"[DBA tier 1] The `events.ts` identifier still exists, but its type changed from "
	"ISO text to BIGINT epoch milliseconds. A string date predicate can parse or "
	"compare incorrectly; confirm with `describe_table('events')` and samples.",
	"[DBA tier 2] Keep the same half-open UTC day window, but express both bounds as "
	"epoch-ms integers. For 2026-04-21T00:00:00Z through the next midnight, use "
	"`1776729600000 <= ts < 1776816000000`.",
	"[DBA tier 3] Use `SELECT kind, COUNT(*) AS n FROM events WHERE ts >= "
	"1776729600000 AND ts < 1776816000000 GROUP BY kind ORDER BY kind`. Do not quote "
	"the bounds; they must be numeric comparisons against the BIGINT column.",
	),
	"09_drift_enum_rule": (
	"[DBA tier 1] A formerly valid equality on `status = 'active'` now silently loses "
	"rows because the business state was split into multiple stored labels. Sample "
	"`tenants.status` before assuming the old lowercase value still exists.",
	'[DBA tier 2] Preserve the business meaning "active tenants" by filtering on the '
	"union of replacement labels. Keep the same grouping by tier and ordering; only "
	"the status predicate changes.",
	"[DBA tier 3] Use `SELECT tier, COUNT(*) AS n FROM tenants WHERE status IN "
	"('ACTIVE', 'ACTIVE_V2') GROUP BY tier ORDER BY tier`. Avoid `LOWER(status) = "
	"'active'`; it misses `ACTIVE_V2`.",
	),
	"10_drift_field_deprecation": (
	"[DBA tier 1] The inline `posts.author_name` column was normalized away. Describe "
	"`posts` and list tables: you should see `posts.users_id` plus a new `users` "
	"lookup carrying the human-readable name.",
	"[DBA tier 2] Join `posts` to `users` through the new FK, group by `u.full_name`, "
	"and alias it back to `author_name` so the result keeps the old report shape. "
	"The post count still comes from `posts`.",
	"[DBA tier 3] Use `SELECT u.full_name AS author_name, COUNT(*) AS n_posts FROM "
	"posts p JOIN users u ON u.id = p.users_id GROUP BY u.full_name ORDER BY "
	"u.full_name`.",
	),
	}


	# ---------------------------------------------------------------------------
	# Public API
	# ---------------------------------------------------------------------------


	def is_enabled(
	reset_flag: bool \| None = None, *, env_var: str = "SQL_DRIFT_ENABLE_DBA_ORACLE"
	) -> bool:
	"""Resolve the feature flag from (reset kwarg, env var, default-off)."""
	if reset_flag is not None:
	return bool(reset_flag)
	raw = os.environ.get(env_var, "").strip().lower()
	return raw in ("1", "true", "yes", "on")


	def get_hint(scenario_id: str, tier: int) -> str:
	"""Return the hint for ``(scenario_id, tier)``; clamps tier to [1, 3].

	Raises :class:`KeyError` on unknown scenario so tests can detect when
	a new scenario was added without a hint table entry.
	"""
	if scenario_id not in _HINTS:
	raise KeyError(f"no DBA hints for scenario_id={scenario_id!r}; known: {sorted(_HINTS)}")
	tier = max(1, min(3, int(tier)))
	return _HINTS[scenario_id][tier - 1]


	def has_hints(scenario_id: str) -> bool:
	return scenario_id in _HINTS


	def known_scenarios() -> frozenset[str]:
	return frozenset(_HINTS)


	__all__ = [
	"get_hint",
	"has_hints",
	"is_enabled",
	"known_scenarios",
	]