Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Community

junaid0600 committed on Apr 25

Commit

d366e7a

verified ·

1 Parent(s): bb2cfec

Update env/db_simulator.py

Browse files

Files changed (1) hide show

env/db_simulator.py +107 -105

env/db_simulator.py CHANGED Viewed

@@ -1,23 +1,15 @@
 """
 env/db_simulator.py — SQL Database Engineer Agent
 Simulates a production database responding to optimization actions.
-Core mechanism: index coverage reduces query execution time by up to 85-90%.
 """
 import math
-import random
 from typing import Optional
 class DatabaseSimulator:
-    """
-    Simulates a production database that degrades over time.
-    The agent applies optimization actions and sees performance scores change.
-    Performance score: 0-100 (100 = all queries running at target speed).
-    The agent's goal: get performance_score >= target_score.
-    """
     def __init__(self, scenario: dict):
         self.scenario     = scenario
         self.tables       = {t["name"]: dict(t) for t in scenario["tables"]}
@@ -28,6 +20,12 @@ class DatabaseSimulator:
         }
         self.stats_fresh  = {name: False for name in self.tables}
         self.partitioned  = {name: False for name in self.tables}
         self.baseline     = self._compute_score()
         self.history      = [self.baseline]
         self.best_score   = self.baseline
@@ -38,10 +36,6 @@ class DatabaseSimulator:
     # ─────────────────────────────────────────────
     def apply_action(self, action_type: str, payload: dict) -> dict:
-        """
-        Apply an optimization action to the database.
-        Returns delta showing performance change.
-        """
         old_score = self._compute_score()
         affected  = []
@@ -55,11 +49,11 @@ class DatabaseSimulator:
                 self.indexes[table].append(idx_name)
                 affected = self._queries_benefiting_from_index(table, cols)
             else:
-                # Duplicate index — no benefit
                 return {
                     "old_score": old_score, "new_score": old_score,
                     "delta": 0.0, "affected_queries": [],
-                    "improved": False, "message": "Index already exists or table not found."
                 }
         elif action_type == "rewrite_query":
@@ -76,32 +70,38 @@ class DatabaseSimulator:
             table = payload.get("table", "")
             if table in self.tables and not self.partitioned.get(table):
                 self.partitioned[table] = True
-                affected = [q["id"] for q in self.queries if table in q.get("sql", "")]
         elif action_type == "analyze_statistics":
             table = payload.get("table", "")
             if table in self.tables:
                 self.stats_fresh[table] = True
-                affected = [q["id"] for q in self.queries if table in q.get("sql", "")]
         elif action_type == "drop_index":
             table    = payload.get("table", "")
             idx_name = payload.get("index_name", "")
-            if idx_name in self.indexes.get(table, []) and idx_name != "PRIMARY":
                 self.indexes[table].remove(idx_name)
         elif action_type == "add_column":
-            table   = payload.get("table", "")
-            col     = payload.get("column", "")
-            purpose = payload.get("purpose", "")
             if table in self.tables:
                 if "extra_columns" not in self.tables[table]:
                     self.tables[table]["extra_columns"] = []
                 self.tables[table]["extra_columns"].append(col)
-                # Denormalization can help JOINy queries
                 affected = [
                     q["id"] for q in self.queries
-                    if "join" in q.get("sql", "").lower() and table in q.get("sql", "")
                 ]
         new_score = self._compute_score()
@@ -118,60 +118,58 @@ class DatabaseSimulator:
         }
     def inspect_query(self, query_id: str) -> dict:
-        """
-        EXPLAIN a slow query — reveals scan type, rows examined, cost.
-        This is the agent's primary investigation tool.
-        """
         for q in self.queries:
             if q["id"] == query_id:
-                has_index    = self._check_query_index_coverage(q) > 0.1
-                is_partition = self.partitioned.get(q.get("main_table", ""), False)
-                rows_examined = 50 if has_index else q.get("rows_examined",
-                    self.tables.get(q.get("main_table", ""), {}).get("rows", 50000))
                 return {
-                    "query_id":         query_id,
-                    "sql":              q["sql"],
-                    "avg_ms":           q["avg_ms"],
-                    "scan_type":        "INDEX RANGE SCAN" if has_index else "FULL TABLE SCAN",
-                    "rows_examined":    rows_examined,
-                    "partitioned":      is_partition,
                     "optimization_hint": (
                         "Query is using index efficiently."
-                        if has_index
-                        else "No index covering WHERE columns. Consider adding composite index."
                     ),
-                    "main_table":       q.get("main_table", "unknown"),
                 }
         return {"error": f"Query '{query_id}' not found"}
     def analyze_indexes(self, table: str) -> dict:
-        """
-        Show all indexes on a table + usage stats + missing index hints.
-        """
         if table not in self.tables:
             return {"error": f"Table '{table}' not found"}
-        existing   = self.indexes.get(table, [])
-        hints      = [
             h for h in self.scenario.get("missing_index_hints", [])
             if h.get("table") == table
         ]
-        used_by    = []
         for q in self.queries:
             cov = self._check_query_index_coverage(q)
             if table in q.get("sql", "") and cov > 0.1:
                 used_by.append(q["id"])
         return {
-            "table":           table,
-            "row_count":       self.tables[table].get("rows", 0),
             "existing_indexes": existing,
-            "indexes_used_by": used_by,
-            "missing_hints":   hints,
-            "stats_fresh":     self.stats_fresh.get(table, False),
-            "partitioned":     self.partitioned.get(table, False),
-            "size_mb":         self.tables[table].get("size_mb", 0),
         }
     # ─────────────────────────────────────────────
@@ -179,7 +177,6 @@ class DatabaseSimulator:
     # ─────────────────────────────────────────────
     def get_current_state(self) -> dict:
-        """Returns the full current DB state for the Observation."""
         return {
             "performance_score": round(self._compute_score(), 2),
             "baseline_score":    round(self.baseline, 2),
@@ -198,35 +195,57 @@ class DatabaseSimulator:
         return self._compute_score() >= self.target_score
     # ─────────────────────────────────────────────
-    #  INTERNAL SCORING ENGINE
     # ─────────────────────────────────────────────
     def _compute_score(self) -> float:
         """
-        Core scoring: calculates performance score 0-100.
-        Higher = better. Based on how fast queries run given current indexes.
         """
         if not self.queries:
             return 0.0
         scores = []
         for q in self.queries:
-            table       = q.get("main_table", "")
-            coverage    = self._check_query_index_coverage(q)
-            part_bonus  = 0.30 if self.partitioned.get(table, False) else 0.0
-            stats_bonus = 0.05 if self.stats_fresh.get(table, False) else 0.0
-            total_reduction = min(coverage * 0.85 + part_bonus + stats_bonus, 0.97)
-            effective_ms    = q["avg_ms"] * (1 - total_reduction)
-            # Score formula: 100ms = score 99, 1000ms = score 90, 8500ms = ~14
-            score = max(0.0, 100.0 - (effective_ms / 100.0))
-            scores.append(score)
-        return round(sum(scores) / len(scores), 2)
     def _check_query_index_coverage(self, query: dict) -> float:
         """
-        Returns 0.0-1.0 representing how well indexes cover this query's WHERE clause.
-        0.0 = full table scan, 1.0 = perfect index coverage.
         """
         sql = query.get("sql", "").lower()
         for table, indexes in self.indexes.items():
@@ -234,59 +253,42 @@ class DatabaseSimulator:
                 continue
             for idx in indexes:
                 if idx == "PRIMARY":
-                    # Primary key only helps if query filters by primary key
                     if "where id=" in sql or "where id =" in sql:
                         return 0.95
                     continue
-                # Extract columns from index name (idx_col1_col2)
-                cols = idx.replace("idx_", "").split("_")
                 matches = sum(1 for c in cols if c in sql)
                 if matches >= 2:
-                    return 0.90  # Composite index — excellent coverage
                 if matches == 1:
-                    return 0.60  # Single column — partial coverage
         return 0.0
-    def _queries_benefiting_from_index(self, table: str, cols: list) -> list:
-        """Returns query IDs that would benefit from an index on given table/columns."""
-        benefiting = []
-        for q in self.queries:
-            sql = q.get("sql", "").lower()
-            if table in sql and any(c.lower() in sql for c in cols):
-                benefiting.append(q["id"])
-        return benefiting
     def _estimate_rewrite(self, new_sql: str, query: dict) -> float:
-        """
-        Estimates improvement factor from a query rewrite (0.0 to 0.70).
-        Checks for common optimization patterns.
-        """
         new_lower = new_sql.lower()
         old_lower = query.get("sql", "").lower()
         improvement = 0.0
-        # Remove SELECT * → specific columns
         if "select *" not in new_lower and "select *" in old_lower:
             improvement += 0.20
-        # Add LIMIT clause
         if "limit " in new_lower and "limit " not in old_lower:
             improvement += 0.15
-        # Use EXISTS instead of IN subquery
         if "exists" in new_lower and "in (select" in old_lower:
             improvement += 0.25
-        # Use INNER JOIN instead of implicit cross join
-        if "inner join" in new_lower and "," in old_lower and "join" not in old_lower:
             improvement += 0.30
-        # Add WHERE clause that was missing
         if "where" in new_lower and "where" not in old_lower:
             improvement += 0.35
-        # Use COALESCE / ISNULL
         if "coalesce" in new_lower:
             improvement += 0.05
-        return min(improvement, 0.70)

 """
 env/db_simulator.py — SQL Database Engineer Agent
 Simulates a production database responding to optimization actions.
+Core fix: _compute_score() now interpolates from JSON baseline → target
+so baseline matches scenario JSON (e.g. 8.0 not 80.0).
 """
 import math
 from typing import Optional
 class DatabaseSimulator:
     def __init__(self, scenario: dict):
         self.scenario     = scenario
         self.tables       = {t["name"]: dict(t) for t in scenario["tables"]}
         }
         self.stats_fresh  = {name: False for name in self.tables}
         self.partitioned  = {name: False for name in self.tables}
+        # Store original ms for detecting rewrite improvements
+        self._original_query_ms = {
+            q["id"]: q["avg_ms"] for q in scenario["slow_queries"]
+        }
         self.baseline     = self._compute_score()
         self.history      = [self.baseline]
         self.best_score   = self.baseline
     # ─────────────────────────────────────────────
     def apply_action(self, action_type: str, payload: dict) -> dict:
         old_score = self._compute_score()
         affected  = []
                 self.indexes[table].append(idx_name)
                 affected = self._queries_benefiting_from_index(table, cols)
             else:
                 return {
                     "old_score": old_score, "new_score": old_score,
                     "delta": 0.0, "affected_queries": [],
+                    "improved": False,
+                    "message": "Index already exists or table not found."
                 }
         elif action_type == "rewrite_query":
             table = payload.get("table", "")
             if table in self.tables and not self.partitioned.get(table):
                 self.partitioned[table] = True
+                affected = [
+                    q["id"] for q in self.queries
+                    if table in q.get("sql", "")
+                ]
         elif action_type == "analyze_statistics":
             table = payload.get("table", "")
             if table in self.tables:
                 self.stats_fresh[table] = True
+                affected = [
+                    q["id"] for q in self.queries
+                    if table in q.get("sql", "")
+                ]
         elif action_type == "drop_index":
             table    = payload.get("table", "")
             idx_name = payload.get("index_name", "")
+            if (idx_name in self.indexes.get(table, [])
+                    and idx_name != "PRIMARY"):
                 self.indexes[table].remove(idx_name)
         elif action_type == "add_column":
+            table = payload.get("table", "")
+            col   = payload.get("column", "")
             if table in self.tables:
                 if "extra_columns" not in self.tables[table]:
                     self.tables[table]["extra_columns"] = []
                 self.tables[table]["extra_columns"].append(col)
                 affected = [
                     q["id"] for q in self.queries
+                    if "join" in q.get("sql", "").lower()
+                    and table in q.get("sql", "")
                 ]
         new_score = self._compute_score()
         }
     def inspect_query(self, query_id: str) -> dict:
         for q in self.queries:
             if q["id"] == query_id:
+                has_index     = self._check_query_index_coverage(q) > 0.1
+                is_partitioned = self.partitioned.get(
+                    q.get("main_table", ""), False
+                )
+                rows_examined = 50 if has_index else q.get(
+                    "rows_examined",
+                    self.tables.get(
+                        q.get("main_table", ""), {}
+                    ).get("rows", 50000)
+                )
                 return {
+                    "query_id":          query_id,
+                    "sql":               q["sql"],
+                    "avg_ms":            q["avg_ms"],
+                    "scan_type":         "INDEX RANGE SCAN" if has_index
+                                         else "FULL TABLE SCAN",
+                    "rows_examined":     rows_examined,
+                    "partitioned":       is_partitioned,
                     "optimization_hint": (
                         "Query is using index efficiently."
+                        if has_index else
+                        "No index covering WHERE columns. "
+                        "Consider adding composite index."
                     ),
+                    "main_table": q.get("main_table", "unknown"),
                 }
         return {"error": f"Query '{query_id}' not found"}
     def analyze_indexes(self, table: str) -> dict:
         if table not in self.tables:
             return {"error": f"Table '{table}' not found"}
+        existing = self.indexes.get(table, [])
+        hints    = [
             h for h in self.scenario.get("missing_index_hints", [])
             if h.get("table") == table
         ]
+        used_by = []
         for q in self.queries:
             cov = self._check_query_index_coverage(q)
             if table in q.get("sql", "") and cov > 0.1:
                 used_by.append(q["id"])
         return {
+            "table":            table,
+            "row_count":        self.tables[table].get("rows", 0),
             "existing_indexes": existing,
+            "indexes_used_by":  used_by,
+            "missing_hints":    hints,
+            "stats_fresh":      self.stats_fresh.get(table, False),
+            "partitioned":      self.partitioned.get(table, False),
+            "size_mb":          self.tables[table].get("size_mb", 0),
         }
     # ─────────────────────────────────────────────
     # ─────────────────────────────────────────────
     def get_current_state(self) -> dict:
         return {
             "performance_score": round(self._compute_score(), 2),
             "baseline_score":    round(self.baseline, 2),
         return self._compute_score() >= self.target_score
     # ─────────────────────────────────────────────
+    #  INTERNAL SCORING ENGINE — FIXED
     # ─────────────────────────────────────────────
     def _compute_score(self) -> float:
         """
+        FIXED: Interpolates from json_baseline → target_score
+        based on index coverage + rewrite improvements.
+        Before fix: used raw ms formula → gave 80 when JSON said 8
+        After fix:  no index = json_baseline, full index = target_score
         """
         if not self.queries:
             return 0.0
+        json_baseline = self.scenario.get("performance_score_baseline", 50.0)
+        target        = self.scenario.get("target_score", 85.0)
         scores = []
         for q in self.queries:
+            table = q.get("main_table", "")
+            # ── Index coverage improvement ────────────────────────
+            coverage    = self._check_query_index_coverage(q)
+            part_bonus  = 0.25 if self.partitioned.get(table, False) else 0.0
+            stats_bonus = 0.04 if self.stats_fresh.get(table, False) else 0.0
+            index_improvement = min(coverage + part_bonus + stats_bonus, 0.95)
+            # ── Query rewrite improvement ─────────────────────────
+            original_ms    = self._original_query_ms.get(q["id"], q["avg_ms"])
+            rewrite_factor = max(
+                0.0,
+                1.0 - q["avg_ms"] / max(1, original_ms)
+            )
+            rewrite_improvement = rewrite_factor * 0.40
+            # ── Combined improvement fraction (0 → 1) ─────────────
+            combined = min(index_improvement + rewrite_improvement, 1.0)
+            # ── Interpolate: baseline → target ────────────────────
+            q_score = json_baseline + (target - json_baseline) * combined
+            scores.append(q_score)
+        return round(
+            min(100.0, max(0.0, sum(scores) / len(scores))),
+            2
+        )
     def _check_query_index_coverage(self, query: dict) -> float:
         """
+        Returns 0.0-1.0: how well indexes cover this query's WHERE clause.
+        0.0 = full table scan, 0.9 = composite index match.
         """
         sql = query.get("sql", "").lower()
         for table, indexes in self.indexes.items():
                 continue
             for idx in indexes:
                 if idx == "PRIMARY":
                     if "where id=" in sql or "where id =" in sql:
                         return 0.95
                     continue
+                cols    = idx.replace("idx_", "").split("_")
                 matches = sum(1 for c in cols if c in sql)
                 if matches >= 2:
+                    return 0.90   # Composite — excellent
                 if matches == 1:
+                    return 0.60   # Single column — partial
         return 0.0
+    def _queries_benefiting_from_index(
+        self, table: str, cols: list
+    ) -> list:
+        return [
+            q["id"] for q in self.queries
+            if table in q.get("sql", "").lower()
+            and any(c.lower() in q.get("sql", "").lower() for c in cols)
+        ]
     def _estimate_rewrite(self, new_sql: str, query: dict) -> float:
         new_lower = new_sql.lower()
         old_lower = query.get("sql", "").lower()
         improvement = 0.0
         if "select *" not in new_lower and "select *" in old_lower:
             improvement += 0.20
         if "limit " in new_lower and "limit " not in old_lower:
             improvement += 0.15
         if "exists" in new_lower and "in (select" in old_lower:
             improvement += 0.25
+        if ("inner join" in new_lower
+                and "," in old_lower
+                and "join" not in old_lower):
             improvement += 0.30
         if "where" in new_lower and "where" not in old_lower:
             improvement += 0.35
         if "coalesce" in new_lower:
             improvement += 0.05
+        return min(improvement, 0.70)