Spaces:

Prithvigg
/

queryforge

Sleeping

App Files Files Community

queryforge / judge.py

Prithvigg

Upload folder using huggingface_hub

a2264ab verified 3 months ago

Raw

History Blame Contribute Delete

18.7 kB

	"""
	QueryForge Judge — deterministic DuckDB grading + Anthropic AI quality scoring.

	Grading pipeline for each submitted SQL query:

	Stage 1 — Syntax (0.0 → 0.15)
	DuckDB EXPLAIN parses the query. Fail → score = 0.001 (the score floor).

	Stage 2 — Execution (→ 0.30)
	Run the full query against in-memory DuckDB seeded with task data.
	Fail → score = 0.15–0.30 (syntax was fine, runtime error; the AI judge may lift it above 0.15).

	Stage 3 — Correctness (→ 0.80)
	Compare returned rows against expected rows.
	Perfect match → deterministic score reaches 0.80.
	Partial credit for correct row count or partial row matches.

	Stage 4 — AI Quality (→ 1.0)
	Anthropic claude-haiku-4-5 evaluates optimization, code style, and
	semantic correctness vs. the reference solution.
	The AI score can move the final score up to 0.999 (final scores are clamped
	to 0.001–0.999) when rows are correct, or provide nuanced feedback even
	when rows are partially wrong.

	Environment variable required:
	ANTHROPIC_API_KEY — standard Anthropic SDK key.
	"""

	import json
	import re
	from typing import Any, Dict, List, Optional, Tuple

	import anthropic
	import duckdb

	try:
	from .tasks import SQLTask, TestCase
	except ImportError:
	from tasks import SQLTask, TestCase

	# Anthropic model id passed as `model=` in the Stage 4 judge request.
	JUDGE_MODEL = "claude-haiku-4-5-20251001"
	# ---------------------------------------------------------------------------
	# Stage 1 — Syntax check
	# ---------------------------------------------------------------------------

	def _reject_multi_statement(query: str) -> Optional[str]:
	"""Return an error message if the query contains multiple statements."""
	# Strip string literals and comments before checking for semicolons
	stripped = re.sub(r"'[^']*'", "", query) # remove string literals
	stripped = re.sub(r"--[^\n]*", "", stripped) # remove line comments
	stripped = re.sub(r"/\.?\*/", "", stripped, flags=re.DOTALL) # block comments
	stripped = stripped.strip().rstrip(";") # allow a single trailing semicolon
	if ";" in stripped:
	return "Multi-statement queries are not allowed."
	return None


	def check_syntax(query: str) -> Tuple[bool, Optional[str]]:
	    """
	    Return (is_valid, error_message).

	    Strategy: run EXPLAIN against an empty in-memory DuckDB.
	    - Multi-statement input → rejected before DuckDB is consulted.
	    - Parser error → genuine syntax error → invalid.
	    - "Catalog Error" / "Binder Error" → tables unknown but syntax is fine → valid.
	    - Any other exception → treat as syntax error to be safe.
	    """
	    multi_err = _reject_multi_statement(query)
	    if multi_err:
	        return False, multi_err

	    conn = duckdb.connect(":memory:")
	    try:
	        conn.execute(f"EXPLAIN {query}")
	        return True, None
	    except Exception as exc:
	        msg = str(exc)
	        # Classify parser failures FIRST. The tags checked further down are
	        # deliberately loose (e.g. the bare word "column"), so a parser error
	        # whose text happens to contain one of them must not be waved through.
	        if isinstance(exc, duckdb.ParserException) or msg.lstrip().startswith(
	            "Parser Error"
	        ):
	            return False, msg
	        # Catalog/Binder errors mean the SQL parsed fine; tables just aren't seeded.
	        if any(
	            tag in msg
	            for tag in ("Catalog Error", "Binder Error", "Table with name",
	                        "Referenced column", "does not exist", "column")
	        ):
	            return True, None
	        return False, msg
	    finally:
	        conn.close()


	# ---------------------------------------------------------------------------
	# Stage 2 — Execution
	# ---------------------------------------------------------------------------

	def execute_query(
	    schema_ddl: str, query: str
	) -> Tuple[bool, Optional[List[Dict[str, Any]]], Optional[str]]:
	    """
	    Run `query` in a throwaway in-memory DuckDB first seeded with `schema_ddl`.

	    Returns (success, rows_as_list_of_dicts, error_message): on success the
	    rows are dicts keyed by column name and the error is None; on any
	    exception the rows are None and the error is the exception text.
	    The connection is closed in both cases.
	    """
	    db = duckdb.connect(":memory:")
	    try:
	        db.execute(schema_ddl)
	        frame = db.execute(query).fetchdf()
	        # One dict per record, with numpy scalars converted to native Python.
	        records: List[Dict[str, Any]] = [
	            {column: _native(cell) for column, cell in record.items()}
	            for record in frame.to_dict(orient="records")
	        ]
	    except Exception as exc:
	        return False, None, str(exc)
	    else:
	        return True, records, None
	    finally:
	        db.close()


	def _native(value: Any) -> Any:
	"""Convert numpy scalars → native Python types for JSON-safe comparison."""
	try:
	import numpy as np # duckdb fetchdf() returns numpy types
	if isinstance(value, (np.integer,)):
	return int(value)
	if isinstance(value, (np.floating,)):
	return float(value)
	if isinstance(value, np.bool_):
	return bool(value)
	except ImportError:
	pass
	return value


	# ---------------------------------------------------------------------------
	# Stage 3 — Row correctness
	# ---------------------------------------------------------------------------

	def _normalize(row: Dict[str, Any]) -> Dict[str, Any]:
	"""Round floats to 2 dp so 999.99000000001 == 999.99."""
	return {
	k: (round(float(v), 2) if isinstance(v, float) else v)
	for k, v in row.items()
	}


	def _sort_key(row: Dict[str, Any], order_by: Optional[str]) -> tuple:
	if order_by:
	cols = [c.strip() for c in order_by.split(",")]
	return tuple(str(row.get(c, "")) for c in cols)
	return tuple(str(v) for v in row.values())


	def rows_match(
	    actual: List[Dict[str, Any]],
	    expected: List[Dict[str, Any]],
	    order_by: Optional[str] = None,
	) -> Tuple[float, str]:
	    """
	    Compare actual vs expected rows and return (score, feedback).

	    Scoring:
	    1.0       — exact match (or both sides empty)
	    0.8       — an empty result was expected but rows were returned
	    0.5–0.9   — row count matches, some rows differ
	    0.10–0.45 — row count wrong; scaled by how many expected rows are present
	    0.0       — empty when non-empty expected

	    Rows are compared on the expected columns only (matched case-insensitively
	    against the actual column names) after floats are rounded to 2 dp.
	    """
	    if not expected:
	        if not actual:
	            return 1.0, "No expected rows — query accepted."
	        return 0.8, f"Expected empty result but got {len(actual)} row(s)."

	    if not actual:
	        return 0.0, f"Query returned 0 rows; expected {len(expected)}."

	    # Project actual rows to only the expected columns (agent may SELECT extra).
	    # Use case-insensitive matching: build a map from lower(actual_col) → actual_col.
	    expected_cols = list(expected[0].keys())
	    lower_map = {k.lower(): k for k in actual[0].keys()}

	    def _project(row: Dict[str, Any]) -> Dict[str, Any]:
	        out: Dict[str, Any] = {}
	        for ec in expected_cols:
	            actual_key = lower_map.get(ec.lower())
	            if actual_key is not None:
	                out[ec] = row[actual_key]
	        return out

	    actual_norm = [_normalize(_project(row)) for row in actual]
	    expected_norm = [_normalize(r) for r in expected]

	    if len(actual_norm) != len(expected_norm):
	        # Multiset match: every expected row can be claimed by at most one
	        # returned row. Counting each returned row that merely equals *some*
	        # expected row would let duplicated output (e.g. from an accidental
	        # cartesian product) push coverage above 1.0 and the score past 0.45.
	        unclaimed = [tuple(sorted(r.items())) for r in expected_norm]
	        correct_rows = 0
	        for r in actual_norm:
	            row_key = tuple(sorted(r.items()))
	            if row_key in unclaimed:
	                unclaimed.remove(row_key)
	                correct_rows += 1
	        # Score based on fraction of expected rows correctly returned
	        coverage = correct_rows / len(expected)
	        # Base 0.10 for count mismatch, up to 0.45 for high coverage of correct rows
	        score = 0.10 + 0.35 * coverage
	        return score, (
	            f"Row count mismatch: got {len(actual_norm)}, expected {len(expected)}. "
	            f"{correct_rows}/{len(expected)} expected rows present."
	        )

	    # Same row count: sort both sides with the same key, then compare pairwise.
	    actual_sorted = sorted(actual_norm, key=lambda r: _sort_key(r, order_by))
	    expected_sorted = sorted(expected_norm, key=lambda r: _sort_key(r, order_by))

	    matches = sum(1 for a, e in zip(actual_sorted, expected_sorted) if a == e)
	    row_accuracy = matches / len(expected)

	    if row_accuracy == 1.0:
	        return 1.0, "All rows match perfectly."

	    score = 0.5 + 0.4 * row_accuracy
	    return score, f"{matches}/{len(expected)} rows match correctly."


	# ---------------------------------------------------------------------------
	# Stage 4 — Anthropic AI judge
	# ---------------------------------------------------------------------------

	def call_anthropic_judge(
	    task: SQLTask,
	    agent_query: str,
	    execution_success: bool,
	    execution_error: Optional[str],
	    actual_rows: Optional[List[Dict[str, Any]]],
	    deterministic_score: float,
	) -> Tuple[float, str, str]:
	    """
	    Ask the Anthropic model named by JUDGE_MODEL to evaluate query quality
	    across three axes:
	    - Correctness (0–0.50)
	    - Optimization (0–0.30) — avoids inefficiencies, uses best SQL patterns
	    - Code quality (0–0.20) — readable, well-aliased, idiomatic SQL

	    Returns (final_score, feedback, improvement_hint); the score is the
	    model's "score" field clamped to 0.0–1.0.
	    Falls back to (deterministic_score, "[AI Judge unavailable] …", task.hint)
	    if creating the client, calling the API, or parsing the reply fails.
	    """
	    # default=str: result rows may hold values json cannot encode natively;
	    # stringify them for display instead of raising before the fallback
	    # handler below can take over.
	    sample_actual = json.dumps(
	        actual_rows[:5] if actual_rows else [], indent=2, default=str
	    )
	    sample_expected = json.dumps(
	        task.test_cases[0].expected_rows if task.test_cases else [],
	        indent=2,
	        default=str,
	    )

	    prompt = f"""\
	You are a strict SQL expert judge scoring an agent's query for the task below.

	## Task ({task.level})
	{task.description}

	## Agent Query
	```sql
	{agent_query}
	```

	## Execution
	- Success: {execution_success}
	- Error: {execution_error or "None"}
	- Rows returned (first 5): {sample_actual}
	- Expected rows: {sample_expected}

	## Reference Solution
	```sql
	{task.solution_query}
	```

	## Deterministic row-match score (0.0–1.0): {deterministic_score:.3f}

	Score the agent query on THREE axes and sum them for the final score:

	| Axis | Max | Criteria |
	|--------------|------|----------|
	| Correctness | 0.50 | Produces the right rows for the stated goal |
	| Optimization | 0.30 | Avoids cartesian products / correlated subqueries; uses efficient patterns (CTEs, explicit JOINs, proper GROUP BY) |
	| Code quality | 0.20 | Readable aliases, clean formatting, no redundant clauses |

	IMPORTANT rules:
	- If execution failed with a runtime error, Correctness ≤ 0.10.
	- If rows are fully correct per deterministic score ≥ 0.95, Correctness ≥ 0.40.
	- For the medium task: a query that still uses comma-join syntax scores Optimization ≤ 0.05.
	- For the hard task: a query without a CTE scores Optimization ≤ 0.10.

	Respond with ONLY valid JSON (no markdown fences):
	{{
	"correctness": <float 0.0–0.50>,
	"optimization": <float 0.0–0.30>,
	"code_quality": <float 0.0–0.20>,
	"score": <sum of above, float 0.0–1.0>,
	"feedback": "<2–3 sentences summarising what the agent did right/wrong>",
	"hint": "<one concrete actionable improvement, or 'Excellent!' if score >= 0.95>"
	}}"""

	    try:
	        # Created inside the try so a client-construction failure also takes
	        # the graceful fallback path instead of propagating to the caller.
	        client = anthropic.Anthropic()
	        message = client.messages.create(
	            model=JUDGE_MODEL,
	            max_tokens=512,
	            messages=[
	                {"role": "user", "content": prompt},
	                {"role": "assistant", "content": "{"},  # prefill forces JSON-only reply
	            ],
	        )
	        # Prepend the prefilled "{" back before parsing
	        raw = "{" + message.content[0].text.strip()

	        # Belt-and-suspenders: keep only the outermost {...} span in case the
	        # model appended anything after the JSON object.
	        brace_start = raw.find("{")
	        brace_end = raw.rfind("}") + 1
	        if brace_start != -1 and brace_end > brace_start:
	            raw = raw[brace_start:brace_end]

	        data = json.loads(raw)
	        score = float(data["score"])
	        score = max(0.0, min(1.0, score))
	        feedback = str(data.get("feedback", ""))
	        hint = str(data.get("hint", ""))
	        return score, feedback, hint

	    except Exception as exc:
	        # Graceful fallback — no API key, network error, or parse failure
	        msg = str(exc).lower()
	        # "auth" also covers "authentication".
	        if "api_key" in msg or "auth" in msg:
	            reason = "ANTHROPIC_API_KEY not set — deterministic scoring only (max 0.80)"
	        else:
	            reason = f"AI judge call failed ({type(exc).__name__}) — fell back to deterministic score"
	        return (
	            deterministic_score,
	            f"[AI Judge unavailable] {reason}.",
	            task.hint,
	        )


	# ---------------------------------------------------------------------------
	# Public entry point
	# ---------------------------------------------------------------------------

	def grade(
	    task: SQLTask, agent_query: str
	) -> Tuple[float, str, Dict[str, Any]]:
	    """
	    Full grading pipeline. Returns (score, feedback, details_dict).

	    Partial progress scoring:
	    0.001     — syntax error (unparseable or multi-statement)
	    0.15–0.30 — syntax valid, runtime error (AI judge may lift it above 0.15)
	    0.30–0.80 — executes; deterministic score from row matching minus any
	                task-specific structural penalty
	    ≤ 0.999   — rows correct (row score ≥ 0.95) and AI judge available: the
	                AI score is used as the final score

	    Scores from the execution path are clamped to 0.001–0.999.
	    `details` collects the intermediate results of every stage that ran.
	    """
	    details: Dict[str, Any] = {}

	    # ── Stage 1: syntax ──────────────────────────────────────────────────────
	    syntax_ok, syntax_error = check_syntax(agent_query)
	    details["syntax_valid"] = syntax_ok
	    details["syntax_error"] = syntax_error

	    if not syntax_ok:
	        return 0.001, f"Syntax error: {syntax_error}", details

	    # ── Stage 2: execution ───────────────────────────────────────────────────
	    exec_ok, rows, exec_error = execute_query(task.schema_ddl, agent_query)
	    details["execution_success"] = exec_ok
	    details["execution_error"] = exec_error
	    details["rows_returned"] = len(rows) if rows else 0

	    if not exec_ok:
	        # Syntax valid but runtime error — call AI for nuanced feedback
	        ai_score, ai_feedback, ai_hint = call_anthropic_judge(
	            task, agent_query, False, exec_error, None, 0.15
	        )
	        details["ai_score"] = ai_score
	        details["ai_feedback"] = ai_feedback
	        details["ai_hint"] = ai_hint  # recorded here too, as on the success path
	        final = max(0.15, ai_score * 0.3)  # cap at 0.3 when execution fails
	        return final, f"Runtime error: {exec_error} | AI: {ai_feedback}", details

	    # ── Stage 3: row correctness ─────────────────────────────────────────────
	    test_case = task.test_cases[0]
	    row_score, row_feedback = rows_match(rows, test_case.expected_rows, test_case.order_by)
	    details["row_match_score"] = row_score
	    details["row_match_feedback"] = row_feedback

	    # ── Stage 3b: structural checks (task-specific) ─────────────────────────
	    # These prevent high scores when the agent submits the broken query verbatim
	    # or ignores the task's structural requirement.
	    structural_penalty, penalty_notes = _structural_penalty(task, agent_query)
	    row_feedback += penalty_notes
	    details["structural_penalty"] = structural_penalty

	    # Deterministic score: 0.30 base for executing + up to 0.50 for rows − penalty
	    deterministic_score = max(0.30, 0.30 + 0.50 * row_score - structural_penalty)

	    # ── Stage 4: AI quality ──────────────────────────────────────────────────
	    ai_score, ai_feedback, ai_hint = call_anthropic_judge(
	        task, agent_query, True, None, rows, deterministic_score
	    )
	    details["ai_score"] = ai_score
	    details["ai_feedback"] = ai_feedback
	    details["ai_hint"] = ai_hint

	    # Final blending:
	    # AI judge offline (fallback) → use deterministic score directly
	    # rows fully correct → trust AI score
	    # rows partially wrong → clamp AI score to not exceed deterministic
	    # NOTE(review): the fallback is inferred from the two scores being equal,
	    # so a genuine AI score that happens to equal the deterministic score is
	    # treated as a fallback as well.
	    ai_is_fallback = abs(ai_score - deterministic_score) < 0.001
	    if ai_is_fallback:
	        # AI judge was unavailable — use deterministic score as-is
	        final_score = deterministic_score
	    elif row_score >= 0.95:
	        final_score = ai_score
	    elif row_score >= 0.5:
	        # Blend: AI provides nuance but can't exceed deterministic ceiling
	        final_score = min(deterministic_score, ai_score + 0.05)
	    else:
	        # Low row accuracy — stay near deterministic
	        final_score = min(deterministic_score, ai_score * 0.6)

	    final_score = max(0.001, min(0.999, final_score))

	    feedback = (
	        f"[Rows] {row_feedback} "
	        f"[AI Judge] {ai_feedback} "
	        f"[Hint] {ai_hint}"
	    )
	    return final_score, feedback, details


	def _structural_penalty(task: SQLTask, agent_query: str) -> Tuple[float, str]:
	    """
	    Return (penalty, feedback_suffix) for the task's structural requirements.

	    Keywords are looked up in the upper-cased query as whole words with
	    flexible whitespace, so a keyword followed by a newline is still found and
	    an identifier that merely contains a keyword (e.g. DESCRIPTION for DESC)
	    is not counted. The text is not parsed: keywords inside string literals
	    or comments still count.
	    """
	    query_upper = agent_query.upper()

	    def _has(pattern: str) -> bool:
	        return re.search(pattern, query_upper) is not None

	    penalty = 0.0
	    notes = ""

	    if task.level == "hard" and not _has(r"\bWITH\b"):
	        penalty = 0.30  # hard task demands a CTE
	        notes += " (Penalty: no CTE detected — task requires WITH clause.)"
	    elif task.level == "medium" and not _has(r"\bJOIN\b"):
	        penalty = 0.20  # medium task demands explicit JOINs
	        notes += " (Penalty: no explicit JOIN — task requires JOIN … ON syntax.)"
	    elif task.id == "task_expert_recursive":
	        # Two bugs: anchor uses WHERE id=3 (includes VP Eng) + non-recursive CTE (misses deep levels)
	        if not _has(r"\bRECURSIVE\b"):
	            penalty += 0.30
	            notes += " (Penalty: WITH RECURSIVE required — hardcoded levels won't scale.)"
	        # Trailing \b keeps "manager_id = 30" from passing as "manager_id = 3".
	        if not _has(r"\bMANAGER_ID\s*=\s*3\b"):
	            penalty += 0.15
	            notes += " (Penalty: anchor should select subordinates via manager_id, not the VP themselves.)"
	        penalty = min(penalty, 0.40)
	    elif task.id == "task_expert_rank":
	        # Two bugs: ROW_NUMBER (drops ties) + ASC ordering (picks lowest instead of highest)
	        if _has(r"\bROW_NUMBER\b"):
	            penalty += 0.20
	            notes += " (Penalty: ROW_NUMBER() drops tied rows — use RANK() or DENSE_RANK().)"
	        if _has(r"\bASC\b") and not _has(r"\bDESC\b"):
	            penalty += 0.15
	            notes += " (Penalty: ordering by revenue ASC picks lowest earners, not highest.)"
	        penalty = min(penalty, 0.35)
	    elif task.id == "task_expert_window":
	        # Three bugs: missing PARTITION BY on both windows + tied revenues need correct ranking
	        # Need at least 2 PARTITION BY occurrences (one per window function).
	        partition_count = len(re.findall(r"\bPARTITION\s+BY\b", query_upper))
	        if partition_count == 0:
	            penalty += 0.20
	            notes += " (Penalty: missing PARTITION BY — both SUM and RANK must be partitioned per region.)"
	        elif partition_count < 2:
	            penalty += 0.10
	            notes += " (Penalty: only one window function has PARTITION BY — both need it.)"
	        penalty = min(penalty, 0.30)

	    return penalty, notes