Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Files Community

junaid0600 commited on Apr 25

Commit

9f883be

verified ·

1 Parent(s): c750426

Update env/db_simulator.py

Browse files

Files changed (1) hide show

env/db_simulator.py +104 -106

env/db_simulator.py CHANGED Viewed

@@ -1,15 +1,23 @@
 """
 env/db_simulator.py — SQL Database Engineer Agent
 Simulates a production database responding to optimization actions.
-Core fix: _compute_score() now interpolates from JSON baseline → target
-so baseline matches scenario JSON (e.g. 8.0 not 80.0).
 """
 import math
 from typing import Optional
 class DatabaseSimulator:
     def __init__(self, scenario: dict):
         self.scenario     = scenario
         self.tables       = {t["name"]: dict(t) for t in scenario["tables"]}
@@ -20,12 +28,6 @@ class DatabaseSimulator:
         }
         self.stats_fresh  = {name: False for name in self.tables}
         self.partitioned  = {name: False for name in self.tables}
-        # Store original ms for detecting rewrite improvements
-        self._original_query_ms = {
-            q["id"]: q["avg_ms"] for q in scenario["slow_queries"]
-        }
         self.baseline     = self._compute_score()
         self.history      = [self.baseline]
         self.best_score   = self.baseline
@@ -36,6 +38,10 @@ class DatabaseSimulator:
     # ─────────────────────────────────────────────
     def apply_action(self, action_type: str, payload: dict) -> dict:
         old_score = self._compute_score()
         affected  = []
@@ -49,11 +55,11 @@ class DatabaseSimulator:
                 self.indexes[table].append(idx_name)
                 affected = self._queries_benefiting_from_index(table, cols)
             else:
                 return {
                     "old_score": old_score, "new_score": old_score,
                     "delta": 0.0, "affected_queries": [],
-                    "improved": False,
-                    "message": "Index already exists or table not found."
                 }
         elif action_type == "rewrite_query":
@@ -70,38 +76,32 @@ class DatabaseSimulator:
             table = payload.get("table", "")
             if table in self.tables and not self.partitioned.get(table):
                 self.partitioned[table] = True
-                affected = [
-                    q["id"] for q in self.queries
-                    if table in q.get("sql", "")
-                ]
         elif action_type == "analyze_statistics":
             table = payload.get("table", "")
             if table in self.tables:
                 self.stats_fresh[table] = True
-                affected = [
-                    q["id"] for q in self.queries
-                    if table in q.get("sql", "")
-                ]
         elif action_type == "drop_index":
             table    = payload.get("table", "")
             idx_name = payload.get("index_name", "")
-            if (idx_name in self.indexes.get(table, [])
-                    and idx_name != "PRIMARY"):
                 self.indexes[table].remove(idx_name)
         elif action_type == "add_column":
-            table = payload.get("table", "")
-            col   = payload.get("column", "")
             if table in self.tables:
                 if "extra_columns" not in self.tables[table]:
                     self.tables[table]["extra_columns"] = []
                 self.tables[table]["extra_columns"].append(col)
                 affected = [
                     q["id"] for q in self.queries
-                    if "join" in q.get("sql", "").lower()
-                    and table in q.get("sql", "")
                 ]
         new_score = self._compute_score()
@@ -118,58 +118,60 @@ class DatabaseSimulator:
         }
     def inspect_query(self, query_id: str) -> dict:
         for q in self.queries:
             if q["id"] == query_id:
-                has_index     = self._check_query_index_coverage(q) > 0.1
-                is_partitioned = self.partitioned.get(
-                    q.get("main_table", ""), False
-                )
-                rows_examined = 50 if has_index else q.get(
-                    "rows_examined",
-                    self.tables.get(
-                        q.get("main_table", ""), {}
-                    ).get("rows", 50000)
-                )
                 return {
-                    "query_id":          query_id,
-                    "sql":               q["sql"],
-                    "avg_ms":            q["avg_ms"],
-                    "scan_type":         "INDEX RANGE SCAN" if has_index
-                                         else "FULL TABLE SCAN",
-                    "rows_examined":     rows_examined,
-                    "partitioned":       is_partitioned,
                     "optimization_hint": (
                         "Query is using index efficiently."
-                        if has_index else
-                        "No index covering WHERE columns. "
-                        "Consider adding composite index."
                     ),
-                    "main_table": q.get("main_table", "unknown"),
                 }
         return {"error": f"Query '{query_id}' not found"}
     def analyze_indexes(self, table: str) -> dict:
         if table not in self.tables:
             return {"error": f"Table '{table}' not found"}
-        existing = self.indexes.get(table, [])
-        hints    = [
             h for h in self.scenario.get("missing_index_hints", [])
             if h.get("table") == table
         ]
-        used_by = []
         for q in self.queries:
             cov = self._check_query_index_coverage(q)
             if table in q.get("sql", "") and cov > 0.1:
                 used_by.append(q["id"])
         return {
-            "table":            table,
-            "row_count":        self.tables[table].get("rows", 0),
             "existing_indexes": existing,
-            "indexes_used_by":  used_by,
-            "missing_hints":    hints,
-            "stats_fresh":      self.stats_fresh.get(table, False),
-            "partitioned":      self.partitioned.get(table, False),
-            "size_mb":          self.tables[table].get("size_mb", 0),
         }
     # ─────────────────────────────────────────────
@@ -177,6 +179,7 @@ class DatabaseSimulator:
     # ─────────────────────────────────────────────
     def get_current_state(self) -> dict:
         return {
             "performance_score": round(self._compute_score(), 2),
             "baseline_score":    round(self.baseline, 2),
@@ -195,57 +198,35 @@ class DatabaseSimulator:
         return self._compute_score() >= self.target_score
     # ─────────────────────────────────────────────
-    #  INTERNAL SCORING ENGINE — FIXED
     # ─────────────────────────────────────────────
     def _compute_score(self) -> float:
         """
-        FIXED: Interpolates from json_baseline → target_score
-        based on index coverage + rewrite improvements.
-        Before fix: used raw ms formula → gave 80 when JSON said 8
-        After fix:  no index = json_baseline, full index = target_score
         """
         if not self.queries:
             return 0.0
-        json_baseline = self.scenario.get("performance_score_baseline", 50.0)
-        target        = self.scenario.get("target_score", 85.0)
         scores = []
         for q in self.queries:
-            table = q.get("main_table", "")
-            # ── Index coverage improvement ────────────────────────
             coverage    = self._check_query_index_coverage(q)
-            part_bonus  = 0.25 if self.partitioned.get(table, False) else 0.0
-            stats_bonus = 0.04 if self.stats_fresh.get(table, False) else 0.0
-            index_improvement = min(coverage + part_bonus + stats_bonus, 0.95)
-            # ── Query rewrite improvement ─────────────────────────
-            original_ms    = self._original_query_ms.get(q["id"], q["avg_ms"])
-            rewrite_factor = max(
-                0.0,
-                1.0 - q["avg_ms"] / max(1, original_ms)
-            )
-            rewrite_improvement = rewrite_factor * 0.40
-            # ── Combined improvement fraction (0 → 1) ─────────────
-            combined = min(index_improvement + rewrite_improvement, 1.0)
-            # ── Interpolate: baseline → target ────────────────────
-            q_score = json_baseline + (target - json_baseline) * combined
-            scores.append(q_score)
-        return round(
-            min(100.0, max(0.0, sum(scores) / len(scores))),
-            2
-        )
     def _check_query_index_coverage(self, query: dict) -> float:
         """
-        Returns 0.0-1.0: how well indexes cover this query's WHERE clause.
-        0.0 = full table scan, 0.9 = composite index match.
         """
         sql = query.get("sql", "").lower()
         for table, indexes in self.indexes.items():
@@ -253,42 +234,59 @@ class DatabaseSimulator:
                 continue
             for idx in indexes:
                 if idx == "PRIMARY":
                     if "where id=" in sql or "where id =" in sql:
                         return 0.95
                     continue
-                cols    = idx.replace("idx_", "").split("_")
                 matches = sum(1 for c in cols if c in sql)
                 if matches >= 2:
-                    return 0.90   # Composite — excellent
                 if matches == 1:
-                    return 0.60   # Single column — partial
         return 0.0
-    def _queries_benefiting_from_index(
-        self, table: str, cols: list
-    ) -> list:
-        return [
-            q["id"] for q in self.queries
-            if table in q.get("sql", "").lower()
-            and any(c.lower() in q.get("sql", "").lower() for c in cols)
-        ]
     def _estimate_rewrite(self, new_sql: str, query: dict) -> float:
         new_lower = new_sql.lower()
         old_lower = query.get("sql", "").lower()
         improvement = 0.0
         if "select *" not in new_lower and "select *" in old_lower:
             improvement += 0.20
         if "limit " in new_lower and "limit " not in old_lower:
             improvement += 0.15
         if "exists" in new_lower and "in (select" in old_lower:
             improvement += 0.25
-        if ("inner join" in new_lower
-                and "," in old_lower
-                and "join" not in old_lower):
             improvement += 0.30
         if "where" in new_lower and "where" not in old_lower:
             improvement += 0.35
         if "coalesce" in new_lower:
             improvement += 0.05
         return min(improvement, 0.70)

 """
 env/db_simulator.py — SQL Database Engineer Agent
 Simulates a production database responding to optimization actions.
+Core mechanism: index coverage reduces query execution time by up to 85-90%.
 """
 import math
+import random
 from typing import Optional
 class DatabaseSimulator:
+    """
+    Simulates a production database that degrades over time.
+    The agent applies optimization actions and sees performance scores change.
+    Performance score: 0-100 (100 = all queries running at target speed).
+    The agent's goal: get performance_score >= target_score.
+    """
     def __init__(self, scenario: dict):
         self.scenario     = scenario
         self.tables       = {t["name"]: dict(t) for t in scenario["tables"]}
         }
         self.stats_fresh  = {name: False for name in self.tables}
         self.partitioned  = {name: False for name in self.tables}
         self.baseline     = self._compute_score()
         self.history      = [self.baseline]
         self.best_score   = self.baseline
     # ─────────────────────────────────────────────
     def apply_action(self, action_type: str, payload: dict) -> dict:
+        """
+        Apply an optimization action to the database.
+        Returns delta showing performance change.
+        """
         old_score = self._compute_score()
         affected  = []
                 self.indexes[table].append(idx_name)
                 affected = self._queries_benefiting_from_index(table, cols)
             else:
+                # Duplicate index — no benefit
                 return {
                     "old_score": old_score, "new_score": old_score,
                     "delta": 0.0, "affected_queries": [],
+                    "improved": False, "message": "Index already exists or table not found."
                 }
         elif action_type == "rewrite_query":
             table = payload.get("table", "")
             if table in self.tables and not self.partitioned.get(table):
                 self.partitioned[table] = True
+                affected = [q["id"] for q in self.queries if table in q.get("sql", "")]
         elif action_type == "analyze_statistics":
             table = payload.get("table", "")
             if table in self.tables:
                 self.stats_fresh[table] = True
+                affected = [q["id"] for q in self.queries if table in q.get("sql", "")]
         elif action_type == "drop_index":
             table    = payload.get("table", "")
             idx_name = payload.get("index_name", "")
+            if idx_name in self.indexes.get(table, []) and idx_name != "PRIMARY":
                 self.indexes[table].remove(idx_name)
         elif action_type == "add_column":
+            table   = payload.get("table", "")
+            col     = payload.get("column", "")
+            purpose = payload.get("purpose", "")
             if table in self.tables:
                 if "extra_columns" not in self.tables[table]:
                     self.tables[table]["extra_columns"] = []
                 self.tables[table]["extra_columns"].append(col)
+                # Denormalization can help JOINy queries
                 affected = [
                     q["id"] for q in self.queries
+                    if "join" in q.get("sql", "").lower() and table in q.get("sql", "")
                 ]
         new_score = self._compute_score()
         }
     def inspect_query(self, query_id: str) -> dict:
+        """
+        EXPLAIN a slow query — reveals scan type, rows examined, cost.
+        This is the agent's primary investigation tool.
+        """
         for q in self.queries:
             if q["id"] == query_id:
+                has_index    = self._check_query_index_coverage(q) > 0.1
+                is_partition = self.partitioned.get(q.get("main_table", ""), False)
+                rows_examined = 50 if has_index else q.get("rows_examined",
+                    self.tables.get(q.get("main_table", ""), {}).get("rows", 50000))
                 return {
+                    "query_id":         query_id,
+                    "sql":              q["sql"],
+                    "avg_ms":           q["avg_ms"],
+                    "scan_type":        "INDEX RANGE SCAN" if has_index else "FULL TABLE SCAN",
+                    "rows_examined":    rows_examined,
+                    "partitioned":      is_partition,
                     "optimization_hint": (
                         "Query is using index efficiently."
+                        if has_index
+                        else "No index covering WHERE columns. Consider adding composite index."
                     ),
+                    "main_table":       q.get("main_table", "unknown"),
                 }
         return {"error": f"Query '{query_id}' not found"}
     def analyze_indexes(self, table: str) -> dict:
+        """
+        Show all indexes on a table + usage stats + missing index hints.
+        """
         if table not in self.tables:
             return {"error": f"Table '{table}' not found"}
+        existing   = self.indexes.get(table, [])
+        hints      = [
             h for h in self.scenario.get("missing_index_hints", [])
             if h.get("table") == table
         ]
+        used_by    = []
         for q in self.queries:
             cov = self._check_query_index_coverage(q)
             if table in q.get("sql", "") and cov > 0.1:
                 used_by.append(q["id"])
         return {
+            "table":           table,
+            "row_count":       self.tables[table].get("rows", 0),
             "existing_indexes": existing,
+            "indexes_used_by": used_by,
+            "missing_hints":   hints,
+            "stats_fresh":     self.stats_fresh.get(table, False),
+            "partitioned":     self.partitioned.get(table, False),
+            "size_mb":         self.tables[table].get("size_mb", 0),
         }
     # ─────────────────────────────────────────────
     # ─────────────────────────────────────────────
     def get_current_state(self) -> dict:
+        """Returns the full current DB state for the Observation."""
         return {
             "performance_score": round(self._compute_score(), 2),
             "baseline_score":    round(self.baseline, 2),
         return self._compute_score() >= self.target_score
     # ─────────────────────────────────────────────
+    #  INTERNAL SCORING ENGINE
     # ─────────────────────────────────────────────
     def _compute_score(self) -> float:
         """
+        Core scoring: calculates performance score 0-100.
+        Higher = better. Based on how fast queries run given current indexes.
         """
         if not self.queries:
             return 0.0
         scores = []
         for q in self.queries:
+            table       = q.get("main_table", "")
             coverage    = self._check_query_index_coverage(q)
+            part_bonus  = 0.30 if self.partitioned.get(table, False) else 0.0
+            stats_bonus = 0.05 if self.stats_fresh.get(table, False) else 0.0
+            total_reduction = min(coverage * 0.85 + part_bonus + stats_bonus, 0.97)
+            effective_ms    = q["avg_ms"] * (1 - total_reduction)
+            # Score formula: 100ms = score 99, 1000ms = score 90, 8500ms = ~14
+            score = max(0.0, 100.0 - (effective_ms / 100.0))
+            scores.append(score)
+        return round(sum(scores) / len(scores), 2)
     def _check_query_index_coverage(self, query: dict) -> float:
         """
+        Returns 0.0-1.0 representing how well indexes cover this query's WHERE clause.
+        0.0 = full table scan, 1.0 = perfect index coverage.
         """
         sql = query.get("sql", "").lower()
         for table, indexes in self.indexes.items():
                 continue
             for idx in indexes:
                 if idx == "PRIMARY":
+                    # Primary key only helps if query filters by primary key
                     if "where id=" in sql or "where id =" in sql:
                         return 0.95
                     continue
+                # Extract columns from index name (idx_col1_col2)
+                cols = idx.replace("idx_", "").split("_")
                 matches = sum(1 for c in cols if c in sql)
                 if matches >= 2:
+                    return 0.90  # Composite index — excellent coverage
                 if matches == 1:
+                    return 0.60  # Single column — partial coverage
         return 0.0
+    def _queries_benefiting_from_index(self, table: str, cols: list) -> list:
+        """Returns query IDs that would benefit from an index on given table/columns."""
+        benefiting = []
+        for q in self.queries:
+            sql = q.get("sql", "").lower()
+            if table in sql and any(c.lower() in sql for c in cols):
+                benefiting.append(q["id"])
+        return benefiting
     def _estimate_rewrite(self, new_sql: str, query: dict) -> float:
+        """
+        Estimates improvement factor from a query rewrite (0.0 to 0.70).
+        Checks for common optimization patterns.
+        """
         new_lower = new_sql.lower()
         old_lower = query.get("sql", "").lower()
         improvement = 0.0
+        # Remove SELECT * → specific columns
         if "select *" not in new_lower and "select *" in old_lower:
             improvement += 0.20
+        # Add LIMIT clause
         if "limit " in new_lower and "limit " not in old_lower:
             improvement += 0.15
+        # Use EXISTS instead of IN subquery
         if "exists" in new_lower and "in (select" in old_lower:
             improvement += 0.25
+        # Use INNER JOIN instead of implicit cross join
+        if "inner join" in new_lower and "," in old_lower and "join" not in old_lower:
             improvement += 0.30
+        # Add WHERE clause that was missing
         if "where" in new_lower and "where" not in old_lower:
             improvement += 0.35
+        # Use COALESCE / ISNULL
         if "coalesce" in new_lower:
             improvement += 0.05
         return min(improvement, 0.70)