Spaces:
Paused
feat(core): add grader and SQLOptimEnv environment class
graders.py — 6-component composite reward function:
1. Issue Detection 60% keyword-match against ground truth issues
2. Optimized Query 15% length + anti-pattern removal heuristics
3. Approval Correct 10% bool match vs. approved_expected
4. Summary Quality 8% progressive scoring on summary length
5. Improvement Est. 4% keyword-match on estimated_improvement field
6. Severity Labels 3% checks severity values are present
Minimum reward of 0.02 for any non-empty submission (partial signal)
env.py — SQLOptimEnv class:
- reset(task_id): validates task, initialises episode state, returns Observation
- step(action): grades action, tracks issues_found_so_far, returns StepResult
- state(): returns EnvironmentState snapshot without advancing episode
- Episode terminates on max_steps OR reward >= 0.95 (early exit)
- env.py +109 -0
- graders.py +126 -0
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Optional
|
| 2 |
+
from models import Observation, Action, Reward, StepResult, EnvironmentState
|
| 3 |
+
from tasks import TASKS
|
| 4 |
+
from graders import grade
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SQLOptimEnv:
    """
    OpenEnv-compliant environment for SQL Query Optimization.

    An AI agent iteratively analyzes a SQL query, identifies performance issues,
    and submits optimized rewrites. Each call to step() grades the submitted
    action, and the episode runs until the step budget is exhausted or the
    agent achieves a near-perfect score.
    """

    def __init__(self):
        # No episode is active until reset() is called.
        self._task_data: Optional[dict] = None   # active task definition, or None
        self._step_count: int = 0                # actions taken this episode
        self._done: bool = False                 # episode-finished flag
        self._cumulative_reward: float = 0.0     # running sum of per-step scores
        self._issues_found: list = []            # unique issue_type values seen so far

    def reset(self, task_id: str = "task_1_basic_antipatterns") -> Observation:
        """Start a new episode for the given task.

        Raises ValueError when task_id is not a known task.
        """
        if task_id not in TASKS:
            raise ValueError(
                f"Unknown task_id '{task_id}'. "
                f"Valid tasks: {list(TASKS.keys())}"
            )

        # Fresh episode state.
        self._task_data = TASKS[task_id]
        self._step_count = 0
        self._done = False
        self._cumulative_reward = 0.0
        self._issues_found = []

        return self._make_observation()

    def step(self, action: Action) -> StepResult:
        """Grade one agent action and advance the episode.

        Raises RuntimeError when called before reset() or after the
        episode has already finished.
        """
        if self._task_data is None:
            raise RuntimeError("Episode not started. Call reset() first.")
        if self._done:
            raise RuntimeError("Episode already finished. Call reset() to start a new episode.")

        self._step_count += 1

        # Score this action against the task's ground truth.
        reward: Reward = grade(self._task_data, action)
        self._cumulative_reward += reward.score

        # Record newly reported issue types, preserving first-seen order.
        for suggestion in action.suggestions:
            kind = suggestion.get("issue_type", "")
            if kind and kind not in self._issues_found:
                self._issues_found.append(kind)

        # Terminal conditions: step budget exhausted OR near-perfect score.
        self._done = (
            self._step_count >= self._task_data["max_steps"]
            or reward.score >= 0.95
        )

        return StepResult(
            observation=self._make_observation(),
            reward=reward,
            done=self._done,
            info={
                "step": self._step_count,
                "cumulative_reward": round(self._cumulative_reward, 4),
                "issues_found_count": len(self._issues_found),
            }
        )

    def state(self) -> EnvironmentState:
        """Return a snapshot of the current environment state (for /state endpoint)."""
        if self._task_data is None:
            # No episode in progress: report an inert, already-finished placeholder.
            return EnvironmentState(
                task_id="none",
                step_count=0,
                max_steps=0,
                episode_done=True,
                cumulative_reward=0.0,
                current_task="No active episode"
            )
        return EnvironmentState(
            task_id=self._task_data["task_id"],
            step_count=self._step_count,
            max_steps=self._task_data["max_steps"],
            episode_done=self._done,
            cumulative_reward=round(self._cumulative_reward, 4),
            current_task=self._task_data["task_name"],
        )

    def _make_observation(self) -> Observation:
        """Build the Observation for the current task and episode progress."""
        task = self._task_data
        return Observation(
            task_id=task["task_id"],
            task_name=task["task_name"],
            task_description=task["task_description"],
            sql_query=task["sql_query"],
            schema_info=task["schema_info"],
            dialect=task.get("dialect", "postgresql"),
            difficulty=task["difficulty"],
            step_count=self._step_count,
            max_steps=task["max_steps"],
            issues_found_so_far=list(self._issues_found),
        )
|
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Any, List
|
| 2 |
+
from models import Action, Reward
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def _keyword_match(text: str, keywords: List[str]) -> bool:
|
| 6 |
+
"""Check if any keyword appears in text (case-insensitive)."""
|
| 7 |
+
text_lower = text.lower()
|
| 8 |
+
return any(kw.lower() in text_lower for kw in keywords)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _suggestions_text(action: Action) -> str:
    """Flatten every searchable field of the action into one space-joined string."""
    pieces = [action.summary, action.optimized_query, action.estimated_improvement]
    # Append each per-suggestion field, stringified so non-str values are safe.
    for suggestion in action.suggestions:
        pieces.extend(
            str(suggestion.get(field, ""))
            for field in ("issue_type", "description", "fix", "line", "severity")
        )
    return " ".join(pieces)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def grade(task_data: Dict[str, Any], action: Action) -> Reward:
    """
    Grade an agent's SQL optimization action against ground truth issues.

    Scoring breakdown:
    - Issue Detection: 60% (did agent find the right problems?)
    - Optimized Query Quality: 15% (did agent provide a meaningful rewrite?)
    - Approval Correctness: 10% (correctly flagged as needing changes?)
    - Summary Quality: 8% (is the summary thorough and informative?)
    - Improvement Estimate: 4% (did agent quantify the expected gain?)
    - Severity Labels: 3% (are severity levels present?)

    Returns a Reward with the total score in [0, 1], a per-component
    breakdown dict, and human-readable feedback text.
    """
    ground_truth: List[Dict[str, Any]] = task_data["ground_truth_issues"]
    full_text = _suggestions_text(action)

    # -- 1. Issue Detection Score (0.0-0.60) ---------------------------------
    detected = 0
    detection_feedback = []
    for gt_issue in ground_truth:
        found = _keyword_match(full_text, gt_issue["keywords"])
        if found:
            detected += 1
            detection_feedback.append(f"✅ Found: {gt_issue['type']} (line ~{gt_issue['line']})")
        else:
            detection_feedback.append(f"❌ Missed: {gt_issue['type']} (line ~{gt_issue['line']})")

    # Guard: a task with no ground-truth issues must not divide by zero.
    detection_score = (detected / len(ground_truth)) * 0.60 if ground_truth else 0.0

    # -- 2. Optimized Query Quality (0.0-0.15) -------------------------------
    # Length tiers are a cheap proxy for a substantive rewrite; bonuses reward
    # removing anti-patterns and pairing a rewrite with concrete suggestions.
    query_score = 0.0
    oq = action.optimized_query.strip()
    if len(oq) > 50:
        query_score = 0.05
    if len(oq) > 150:
        query_score = 0.10
    # Bonus if the rewrite removes obvious anti-patterns found in original.
    original_query = task_data["sql_query"].lower()
    if "select *" in original_query and "select *" not in oq.lower():
        query_score = min(query_score + 0.03, 0.15)
    if query_score < 0.15 and len(action.suggestions) > 0 and len(oq) > 100:
        query_score = min(query_score + 0.02, 0.15)
    query_score = min(query_score, 0.15)

    # -- 3. Approval Correctness (0.0-0.10) ----------------------------------
    expected_approved = task_data.get("approved_expected", False)
    approval_score = 0.10 if action.approved == expected_approved else 0.0

    # -- 4. Summary Quality (0.0-0.08) ---------------------------------------
    summary_score = 0.0
    if len(action.summary) > 40:
        summary_score = 0.04
    if len(action.summary) > 100:
        summary_score = 0.08

    # -- 5. Improvement Estimate Present (0.0-0.04) --------------------------
    improvement_keywords = ["x faster", "% less", "% faster", "% improvement", "times", "reduce", "improvement", "speedup"]
    has_estimate = _keyword_match(action.estimated_improvement, improvement_keywords) and len(action.estimated_improvement) > 5
    improvement_score = 0.04 if has_estimate else 0.0

    # -- 6. Severity Labels Present (0.0-0.03) -------------------------------
    severity_keywords = ["critical", "high", "medium", "low"]
    has_severity = any(
        _keyword_match(str(s.get("severity", "")), severity_keywords)
        for s in action.suggestions
    )
    severity_score = 0.03 if has_severity else 0.0

    # -- Final Score ---------------------------------------------------------
    total = (
        detection_score + query_score + approval_score +
        summary_score + improvement_score + severity_score
    )
    total = round(min(max(total, 0.0), 1.0), 4)

    # Minimum signal for any non-empty submission: suggestions, a summary, or
    # a rewrite all count as content (per spec, not only suggestions).
    if total == 0.0 and (action.suggestions or action.summary.strip() or oq):
        total = 0.02

    breakdown = {
        "issue_detection": round(detection_score, 4),
        "optimized_query": round(query_score, 4),
        "approval_correctness": round(approval_score, 4),
        "summary_quality": round(summary_score, 4),
        "improvement_estimate": round(improvement_score, 4),
        "severity_labels": round(severity_score, 4),
    }

    n_suggestions = len(action.suggestions)
    expected_n = len(ground_truth)

    feedback_lines = detection_feedback + [
        f"\nSuggestions submitted: {n_suggestions} (expected ~{expected_n})",
        f"Optimized query length: {len(oq)} chars",
        f"Approval correctness: {'✅' if action.approved == expected_approved else '❌'} "
        f"(you said {'approved' if action.approved else 'needs changes'}, "
        f"expected {'approved' if expected_approved else 'needs changes'})",
        f"Total score: {total:.4f}",
    ]

    return Reward(
        score=total,
        breakdown=breakdown,
        feedback="\n".join(feedback_lines)
    )
|