Spaces:

Dolphin-Syndrom
/

code-review-env

Sleeping

App Files Files Community

code-review-env / inference.py

theaniketgiri

Optimize for Phase 2: 5 tasks, severity scoring, iterative refinement, 32 tests

0bbb422 about 2 months ago

raw

history blame contribute delete

21.5 kB

	"""
	inference.py
	============
	Baseline inference script for the Code Review Environment.

	MANDATORY STDOUT FORMAT
	-----------------------
	[START] task=<task_name> env=<benchmark> model=<model_name>
	[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
	[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>

	Rules:
	- One [START] line at episode begin.
	- One [STEP] line per step, immediately after env.step() returns.
	- One [END] line after the episode ends (always emitted, even on exception).
	- reward and rewards formatted to 2 decimal places.
	- done and success are lowercase booleans: true or false.
	- error is the raw step exception string, or null if none.
	- All fields on a single line with no newlines within a line.

	Required environment variables:
	API_BASE_URL - Proxy endpoint for LLM calls.
	MODEL_NAME - Model identifier for inference.
	HF_TOKEN - Hugging Face / API key.

	Usage:
	python inference.py
	ENV_SERVER_URL=http://localhost:8000 python inference.py
	"""

	import json
	import os
	import re
	import sys
	import textwrap
	import time
	from collections.abc import Callable
	from typing import Any, Optional

	import urllib.request
	import urllib.error

	# ---------------------------------------------------------------------------
	# Configuration — fully environment-driven
	# ---------------------------------------------------------------------------

	API_BASE_URL: str = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
	# The first non-empty credential wins; the placeholder keeps the value a
	# non-empty string when none of the variables is set.
	API_KEY: str = next(
	    (
	        candidate
	        for candidate in map(os.environ.get, ("API_KEY", "HF_TOKEN", "OPENAI_API_KEY"))
	        if candidate
	    ),
	    "missing-api-key",
	)
	MODEL_NAME: str = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
	ENV_SERVER_URL: str = os.environ.get("ENV_SERVER_URL", "http://localhost:8000")

	BENCHMARK = "code_review_env"
	TASKS = ["task_extra_easy", "task_easy", "task_medium", "task_hard", "task_expert"]
	MAX_STEPS = 3
	TEMPERATURE = 0.0
	MAX_TOKENS = 1024
	# Every task currently shares the same pass mark.
	SUCCESS_THRESHOLDS = dict.fromkeys(TASKS, 0.95)

	# Closed set of issue tags an action may report.
	ISSUE_TAXONOMY = [
	    "null_pointer",
	    "missing_return",
	    "type_error",
	    "index_out_of_bounds",
	    "sql_injection",
	    "hardcoded_secret",
	    "missing_input_validation",
	    "race_condition",
	    "timing_attack",
	    "improper_error_handling",
	    "integer_overflow",
	    "path_traversal",
	]

	# Substring heuristics, one per taxonomy tag (12 in total). Each predicate
	# receives the raw code text; insertion order is the order in which anything
	# iterating this mapping will report tags.
	DETECTION_RULES: dict[str, Callable[[str], bool]] = {
	    "null_pointer": lambda src: any(hint in src for hint in (".get(", "= None")),
	    "missing_return": lambda src: "# todo: return" in src.lower(),
	    "sql_injection": lambda src: (
	        any(prefix in src.lower() for prefix in ("f\"select", "f'select"))
	        or "username='{" in src
	    ),
	    "hardcoded_secret": lambda src: any(
	        hint in src.lower() for hint in ("secret_key =", '= "supersecret')
	    ),
	    "race_condition": lambda src: any(op in src for op in ("balance -=", "balance +=")),
	    "timing_attack": lambda src: any(cmp in src for cmp in ("if expected ==", "== actual")),
	    "improper_error_handling": lambda src: any(
	        bare in src for bare in ("except:\n", "except:\r\n")
	    ),
	    "index_out_of_bounds": lambda src: (
	        "len(" in src and any(token in src for token in ("[", "range("))
	    ),
	    "type_error": lambda src: "int(" in src and "str" in src.lower(),
	    "integer_overflow": lambda src: "2 ** 31" in src or "overflow" in src.lower(),
	    "path_traversal": lambda src: "os.path.join" in src and "user" in src.lower(),
	    "missing_input_validation": lambda src: (
	        "open(" in src and "user" in src.lower() and not ("valid" in src.lower())
	    ),
	}

	# Difficulty label -> severity expected for that difficulty (rule-based fallback).
	DIFFICULTY_SEVERITY: dict[str, str] = dict(
	    extra_easy="low",
	    easy="medium",
	    medium="high",
	    hard="critical",
	    expert="critical",
	)

	# System message sent with every review request. The severity choices are
	# written with plain "|" separators: a backslash before "|" is not a valid
	# Python escape sequence and would reach the model as a literal backslash.
	SYSTEM_PROMPT = textwrap.dedent(
	    """
	You are a senior Python code reviewer performing a security and correctness audit.

	Your task: Identify ALL security vulnerabilities, logic errors, and code smells in the
	provided code snippet. Use ONLY the allowed taxonomy tags.

	Return ONLY a valid JSON object with these keys:
	- issues_found: array of issue tags from the allowed taxonomy (be comprehensive)
	- review_comment: detailed explanation of each identified issue with specific line references
	- severity: one of low|medium|high|critical (based on worst-case impact)

	Important rules:
	- Do NOT hallucinate issues that aren't present — false positives are heavily penalized (-0.10 each)
	- DO identify every real issue — each correctly found issue earns significant reward
	- Include relevant keywords in your review_comment for quality bonus scoring
	- Match severity to the overall risk level of the issues found

	Example for a SQL injection + hardcoded secret:
	{
	"issues_found": ["sql_injection", "hardcoded_secret"],
	"review_comment": "SQL injection via f-string query interpolation allows attackers to bypass auth. The SECRET_KEY is hardcoded as plaintext instead of using environment variables.",
	"severity": "high"
	}

	Do not include markdown, code fences, or extra prose outside the JSON.
	"""
	).strip()


	# ---------------------------------------------------------------------------
	# Score clamping
	# ---------------------------------------------------------------------------


	def clamp_val(v: float, low: float = 0.01, high: float = 0.99) -> float:
	    """Limit ``v`` to the closed interval ``[low, high]``.

	    With the default bounds the result always lies strictly between 0 and 1.
	    """
	    # Cap from above first, then lift to the floor.
	    capped = v if v < high else high
	    return capped if capped > low else low


	# ---------------------------------------------------------------------------
	# Mandatory stdout log helpers
	# ---------------------------------------------------------------------------


	def log_start(task: str, env: str, model: str) -> None:
	    """Print the ``[START]`` line that opens an episode and flush stdout."""
	    # print() joins its arguments with single spaces, producing one line.
	    print("[START]", f"task={task}", f"env={env}", f"model={model}", flush=True)


	def log_step(
	    step: int,
	    action: str,
	    reward: float,
	    done: bool,
	    error: Optional[str],
	) -> None:
	    """Print the ``[STEP]`` line for one environment step and flush stdout.

	    Args:
	        step: 1-based step number.
	        action: Serialized action; printed as its ``repr`` after newlines are
	            replaced by spaces.
	        reward: Step reward; passed through ``clamp_val`` and printed with two
	            decimals.
	        done: Whether the episode ended; printed as ``true``/``false``.
	        error: Step exception text, or ``None``/empty for ``null``.
	    """

	    def _single_line(text: str) -> str:
	        # The log format requires every record to fit on exactly one line.
	        return text.replace("\n", " ").replace("\r", " ").strip()

	    action_clean = _single_line(action)
	    # Error text (e.g. an HTTP error body) may span several lines; flatten it
	    # like the action, and report null when nothing printable remains.
	    error_val = (_single_line(error) if error else "") or "null"
	    done_val = str(done).lower()
	    print(
	        f"[STEP] step={step} action={action_clean!r} "
	        f"reward={clamp_val(reward):.2f} done={done_val} error={error_val}",
	        flush=True,
	    )


	def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
	    """Print the ``[END]`` line that closes an episode and flush stdout.

	    The score and every reward go through ``clamp_val`` first; the score is
	    printed with three decimals and each reward with two.
	    """
	    formatted_rewards = [format(clamp_val(value), ".2f") for value in rewards]
	    fields = (
	        f"success={str(success).lower()}",
	        f"steps={steps}",
	        f"score={format(clamp_val(score), '.3f')}",
	        f"rewards={','.join(formatted_rewards)}",
	    )
	    print("[END] " + " ".join(fields), flush=True)


	# ---------------------------------------------------------------------------
	# Environment HTTP helpers
	# ---------------------------------------------------------------------------


	def _post_json(url: str, payload: dict) -> dict[str, Any]:
	    """POST ``payload`` as JSON to ``url`` and return the decoded JSON response.

	    Args:
	        url: Absolute endpoint URL.
	        payload: JSON-serializable request body.

	    Returns:
	        The response body parsed with ``json.loads``.

	    Raises:
	        RuntimeError: The server replied with an HTTP error status; the message
	            contains the status code and the response body, and the original
	            ``HTTPError`` is attached as ``__cause__``.

	    Any other error (connection failure, timeout, invalid JSON) propagates
	    unchanged.
	    """
	    data = json.dumps(payload).encode("utf-8")
	    req = urllib.request.Request(
	        url, data=data, headers={"Content-Type": "application/json"}, method="POST"
	    )
	    try:
	        with urllib.request.urlopen(req, timeout=30) as f:
	            return json.loads(f.read().decode("utf-8"))
	    except urllib.error.HTTPError as e:
	        # Decode leniently so a non-UTF-8 error page cannot hide the HTTP status.
	        body = e.read().decode("utf-8", errors="replace")
	        raise RuntimeError(f"HTTP {e.code}: {body}") from e


	def env_reset(task_id: str) -> dict[str, Any]:
	    """POST ``{"task_id": task_id}`` to the server's ``/reset`` endpoint.

	    Returns whatever ``_post_json`` returns for that request.
	    """
	    endpoint = "{}/reset".format(ENV_SERVER_URL)
	    body = {"task_id": task_id}
	    return _post_json(endpoint, body)


	def env_step(action: dict[str, Any]) -> dict[str, Any]:
	    """POST ``action`` to the server's ``/step`` endpoint.

	    Returns whatever ``_post_json`` returns for that request.
	    """
	    endpoint = "{}/step".format(ENV_SERVER_URL)
	    return _post_json(endpoint, action)


	def unwrap_step_payload(payload: dict[str, Any]) -> tuple[dict[str, Any], float, bool]:
	    """Split a reset/step response into ``(observation, reward, done)``.

	    Two shapes are accepted: a wrapper whose ``"observation"`` value is a dict
	    (reward/done are read from the wrapper, falling back to the observation),
	    or a flat dict that is itself the observation. A missing or falsy reward
	    becomes ``0.0``; a missing ``done`` becomes ``False``.
	    """
	    nested = payload.get("observation")
	    is_wrapped = isinstance(nested, dict)

	    observation = nested if is_wrapped else payload
	    # Only the wrapped form has a second place to look for reward/done.
	    fallback = nested if is_wrapped else {}

	    if "reward" in payload:
	        raw_reward = payload["reward"]
	    else:
	        raw_reward = fallback.get("reward", 0.0)

	    if "done" in payload:
	        raw_done = payload["done"]
	    else:
	        raw_done = fallback.get("done", False)

	    return observation, float(raw_reward or 0.0), bool(raw_done)


	# ---------------------------------------------------------------------------
	# Prompt and action helpers
	# ---------------------------------------------------------------------------


	def build_user_prompt(obs: dict[str, Any], step: int, previous_feedback: str = "") -> str:
	    """Assemble the user message for one review step.

	    The prompt lists the task id, file name, step counter and instruction from
	    ``obs``, then the allowed tags (``obs["available_issue_tags"]`` or, when
	    that is missing/empty, ``ISSUE_TAXONOMY``) and the code snippet. From step 2
	    onward a non-empty ``previous_feedback`` is appended so the model can refine
	    its earlier answer. The closing line restates the required JSON keys.
	    """
	    field = obs.get
	    allowed_tags = ", ".join(field("available_issue_tags") or ISSUE_TAXONOMY)

	    sections = [
	        f"TASK ID: {field('task_id', 'unknown')}",
	        f"FILE: {field('file_name', 'unknown')}",
	        f"STEP: {step} of {MAX_STEPS}",
	        f"INSTRUCTION: {field('task_description', 'N/A')}",
	        f"\nALLOWED ISSUE TAGS:\n{allowed_tags}",
	        f"\nCODE UNDER REVIEW:\n{field('code_snippet', '')}",
	    ]

	    # Feedback only exists once at least one step has been scored.
	    if previous_feedback and step > 1:
	        sections += [
	            f"\nPREVIOUS STEP FEEDBACK (use this to improve your review):\n{previous_feedback}"
	        ]

	    sections += ["\nReturn strictly JSON with keys: issues_found, review_comment, severity."]
	    return "\n".join(sections)


	def detect_issues_rule_based(code_snippet: str) -> list[str]:
	    """Return every tag whose ``DETECTION_RULES`` predicate matches the snippet.

	    Tags appear in the iteration order of ``DETECTION_RULES``.
	    """
	    return [tag for tag, matches in DETECTION_RULES.items() if matches(code_snippet)]


	def infer_severity(issues_found: list[str], task_id: str = "") -> str:
	    """Infer an overall severity from the number and kind of issues found.

	    Args:
	        issues_found: Issue tags reported for the snippet.
	        task_id: Accepted for call-site compatibility; not used by the rules.

	    Returns:
	        ``"critical"`` for three or more issues; otherwise ``"high"`` when any
	        issue is security-related; otherwise ``"medium"`` for exactly two
	        issues; otherwise ``"low"``.
	    """
	    security_issues = {"sql_injection", "hardcoded_secret", "path_traversal", "timing_attack"}
	    issue_count = len(issues_found)

	    if issue_count >= 3:
	        return "critical"
	    # Below three issues, a single security finding is enough for "high".
	    if any(issue in security_issues for issue in issues_found):
	        return "high"
	    # From here on no security issue is present, so only the count matters.
	    if issue_count == 2:
	        return "medium"
	    return "low"


	def build_rule_action(code_snippet: str, task_id: str = "") -> dict[str, Any]:
	    """Build a review action from the static heuristics alone (no LLM).

	    Issues come from ``detect_issues_rule_based`` and severity from
	    ``infer_severity``. The comment is one canned sentence per detected tag,
	    joined with ``". "`` and terminated by a period; when nothing is detected a
	    fixed comment is used and severity is ``"low"``.
	    """
	    # Keyword-rich explanations, one per tag, used to compose the comment.
	    explanations = {
	        "null_pointer": "Null dereference risk: .get() may return None without check",
	        "missing_return": "Missing return statement: function never returns a value",
	        "sql_injection": "SQL injection via f-string query interpolation — use parameterized queries",
	        "hardcoded_secret": "Hardcoded secret key in plaintext — use environment variables",
	        "race_condition": "Race condition: non-atomic check-and-modify on shared balance",
	        "timing_attack": "Timing attack: use hmac.compare_digest for constant-time comparison",
	        "improper_error_handling": "Bare except silently swallows all errors including payment failures",
	        "index_out_of_bounds": "Index out of bounds: off-by-one error accessing array past length",
	        "type_error": "Type error: int() cast on string input without validation may crash",
	        "integer_overflow": "Integer overflow: arithmetic on large values may wrap or go negative",
	        "path_traversal": "Path traversal: os.path.join with user input allows directory escape via ../",
	        "missing_input_validation": "Missing input validation: untrusted user content written without sanitization",
	    }

	    detected = detect_issues_rule_based(code_snippet)
	    inferred = infer_severity(detected, task_id)

	    if not detected:
	        return {
	            "issues_found": detected,
	            "review_comment": "No obvious issues detected from static heuristics.",
	            "severity": "low",
	        }

	    # Tags without a canned sentence contribute nothing to the comment.
	    sentences = [explanations[tag] for tag in detected if tag in explanations]
	    return {
	        "issues_found": detected,
	        "review_comment": ". ".join(sentences) + ".",
	        "severity": inferred,
	    }


	def extract_json_object(text: str) -> dict[str, Any]:
	    """Parse a JSON object out of a model response.

	    A leading/trailing Markdown code fence is removed first. If the remaining
	    text is not itself a JSON object, the span from the first ``{`` to the last
	    ``}`` is tried instead, which tolerates prose around the object or a
	    one-element array wrapping it.

	    Args:
	        text: Raw model output.

	    Returns:
	        The decoded JSON object.

	    Raises:
	        ValueError: ``text`` is empty, or no JSON object could be decoded from
	            it (``json.JSONDecodeError`` is a ``ValueError`` subclass).
	    """
	    if not text:
	        raise ValueError("Empty model response")

	    stripped = text.strip()
	    if stripped.startswith("```"):
	        stripped = re.sub(r"^```(?:json)?", "", stripped, flags=re.IGNORECASE).strip()
	        stripped = re.sub(r"```$", "", stripped).strip()

	    try:
	        parsed = json.loads(stripped)
	    except json.JSONDecodeError:
	        match = re.search(r"\{[\s\S]*\}", stripped)
	        if not match:
	            raise
	        parsed = json.loads(match.group(0))
	    else:
	        if not isinstance(parsed, dict):
	            # Valid JSON but not an object (e.g. a list wrapping the answer):
	            # look for an embedded object before giving up.
	            match = re.search(r"\{[\s\S]*\}", stripped)
	            if match:
	                try:
	                    parsed = json.loads(match.group(0))
	                except json.JSONDecodeError:
	                    pass

	    # Callers use dict methods on the result, so anything else is a failure.
	    if not isinstance(parsed, dict):
	        raise ValueError("Model response is not a JSON object")
	    return parsed


	def normalize_action(payload: dict[str, Any]) -> dict[str, Any]:
	    """Coerce a decoded model answer into a well-formed action dict.

	    Args:
	        payload: Decoded JSON object from the model.

	    Returns:
	        A dict with exactly three keys:
	        ``issues_found`` — tags from ``payload`` that belong to
	        ``ISSUE_TAXONOMY`` after trimming and lower-casing, without duplicates
	        and in first-seen order (a non-list value yields ``[]``);
	        ``review_comment`` — the trimmed comment, or a fixed placeholder when
	        it is missing, null or blank;
	        ``severity`` — one of low/medium/high/critical, defaulting to
	        ``"medium"`` for anything else.
	    """
	    issues_found_raw = payload.get("issues_found", [])
	    if not isinstance(issues_found_raw, list):
	        issues_found_raw = []

	    issues_found: list[str] = []
	    for raw_issue in issues_found_raw:
	        # Tolerate cosmetic differences such as "SQL_Injection " from the model.
	        tag = str(raw_issue).strip().lower()
	        if tag in ISSUE_TAXONOMY and tag not in issues_found:
	            issues_found.append(tag)

	    # `or` maps JSON null to the default; str(None) would yield "None".
	    review_comment = str(payload.get("review_comment") or "").strip()
	    severity = str(payload.get("severity") or "medium").strip().lower()
	    if severity not in {"low", "medium", "high", "critical"}:
	        severity = "medium"
	    if not review_comment:
	        review_comment = "Review based on taxonomy-driven static analysis."

	    return {
	        "issues_found": issues_found,
	        "review_comment": review_comment,
	        "severity": severity,
	    }


	# ---------------------------------------------------------------------------
	# Server readiness
	# ---------------------------------------------------------------------------


	def wait_for_server(timeout: int = 60) -> None:
	    """Poll ``{ENV_SERVER_URL}/health`` until it answers with status 200.

	    Args:
	        timeout: Maximum total time to wait, in seconds.

	    Raises:
	        RuntimeError: No 200 response arrived within ``timeout`` seconds. The
	            last polling error, if any, is attached as ``__cause__``.
	    """
	    # A wall-clock deadline keeps the total wait at `timeout` seconds even when
	    # individual probes hang until their own socket timeout.
	    deadline = time.monotonic() + timeout
	    last_error: Optional[Exception] = None

	    while True:
	        remaining = deadline - time.monotonic()
	        if remaining <= 0:
	            break
	        try:
	            req = urllib.request.Request(f"{ENV_SERVER_URL}/health", method="GET")
	            with urllib.request.urlopen(req, timeout=min(5.0, remaining)) as f:
	                if f.status == 200:
	                    return
	        except Exception as probe_err:
	            # Best-effort polling: any failure just means "not ready yet".
	            last_error = probe_err
	        time.sleep(min(1.0, max(0.0, deadline - time.monotonic())))

	    raise RuntimeError(
	        f"Server at {ENV_SERVER_URL} not ready after {timeout}s"
	    ) from last_error


	# ---------------------------------------------------------------------------
	# Pure urllib OpenAI-compatible Client
	# ---------------------------------------------------------------------------


	class PureUrllibOpenAIClient:
	    """Fallback OpenAI-compatible client using only stdlib urllib."""

	    def __init__(self, base_url: str, api_key: str):
	        """Store the API root (without trailing slashes) and the bearer token."""
	        self.base_url = base_url.rstrip("/")
	        self.api_key = api_key

	    @staticmethod
	    def _extract_content(result: Any) -> str:
	        """Return the first choice's message text, or ``""`` if there is none.

	        Missing keys, an empty ``choices`` list, unexpected value types and a
	        null ``content`` all yield the empty string.
	        """
	        if not isinstance(result, dict):
	            return ""
	        choices = result.get("choices")
	        if not isinstance(choices, list) or not choices:
	            return ""
	        first_choice = choices[0]
	        message = first_choice.get("message") if isinstance(first_choice, dict) else None
	        content = message.get("content") if isinstance(message, dict) else None
	        return content if isinstance(content, str) else ""

	    def create_chat_completion(
	        self,
	        model: str,
	        messages: list[dict[str, str]],
	        temperature: float = 0.0,
	        max_tokens: int = 1024,
	    ) -> str:
	        """POST a non-streaming request to ``{base_url}/chat/completions``.

	        Args:
	            model: Model identifier placed in the request body.
	            messages: Chat messages placed in the request body.
	            temperature: Sampling temperature placed in the request body.
	            max_tokens: Completion token limit placed in the request body.

	        Returns:
	            The text of the first choice, or ``""`` when the response has none.

	        Raises:
	            RuntimeError: The endpoint returned an HTTP error status, or the
	                request/response handling failed for any other reason. The
	                underlying exception is attached as ``__cause__``.
	        """
	        url = f"{self.base_url}/chat/completions"
	        payload = {
	            "model": model,
	            "messages": messages,
	            "temperature": temperature,
	            "max_tokens": max_tokens,
	            "stream": False,
	        }
	        data = json.dumps(payload).encode("utf-8")
	        req = urllib.request.Request(url, data=data, method="POST")
	        req.add_header("Content-Type", "application/json")
	        req.add_header("Authorization", f"Bearer {self.api_key}")

	        try:
	            with urllib.request.urlopen(req, timeout=60) as response:
	                result = json.loads(response.read().decode("utf-8"))
	            return self._extract_content(result)
	        except urllib.error.HTTPError as e:
	            # Decode leniently so a non-UTF-8 body cannot hide the HTTP status.
	            error_body = e.read().decode("utf-8", errors="replace")
	            raise RuntimeError(f"HTTP {e.code}: {error_body}") from e
	        except Exception as e:
	            raise RuntimeError(f"Proxy request failed: {e}") from e


	# ---------------------------------------------------------------------------
	# LLM action builder with iterative refinement
	# ---------------------------------------------------------------------------


	def build_llm_action(
	    client: Any,
	    obs: dict[str, Any],
	    step: int,
	    previous_feedback: str = "",
	    max_retries: int = 3,
	) -> dict[str, Any]:
	    """Ask the LLM for a review action, retrying on any failure.

	    Args:
	        client: Either a ``PureUrllibOpenAIClient`` or an object exposing
	            ``chat.completions.create`` (the OpenAI SDK client shape).
	        obs: Current observation, used to build the user prompt.
	        step: 1-based step number, included in the prompt.
	        previous_feedback: Feedback from the previous step, if any.
	        max_retries: Total number of attempts.

	    Returns:
	        The model's answer after ``extract_json_object`` and
	        ``normalize_action``.

	    Raises:
	        RuntimeError: Every attempt failed; the last error is attached as
	            ``__cause__``.
	    """
	    user_prompt = build_user_prompt(obs=obs, step=step, previous_feedback=previous_feedback)
	    # The conversation is identical for every attempt and both client kinds.
	    messages = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        {"role": "user", "content": user_prompt},
	    ]

	    last_error: Optional[Exception] = None
	    for attempt in range(max_retries):
	        try:
	            if isinstance(client, PureUrllibOpenAIClient):
	                raw_text = client.create_chat_completion(
	                    model=MODEL_NAME,
	                    messages=messages,
	                    temperature=TEMPERATURE,
	                    max_tokens=MAX_TOKENS,
	                )
	            else:
	                response = client.chat.completions.create(
	                    model=MODEL_NAME,
	                    messages=messages,
	                    temperature=TEMPERATURE,
	                    max_tokens=MAX_TOKENS,
	                    stream=False,
	                )
	                raw_text = response.choices[0].message.content or ""

	            return normalize_action(extract_json_object(raw_text))
	        except Exception as llm_err:
	            last_error = llm_err
	            # Exponential backoff between attempts; waiting after the final
	            # attempt would only delay the failure.
	            if attempt < max_retries - 1:
	                time.sleep(2 ** attempt)

	    raise RuntimeError(f"LLM call failed after retries: {last_error}") from last_error


	def get_action(
	    client: Any,
	    obs: dict[str, Any],
	    step: int,
	    previous_feedback: str = "",
	) -> dict[str, Any]:
	    """Get an action from the LLM, falling back to the rule-based reviewer.

	    Any exception from ``build_llm_action`` triggers the fallback, which
	    inspects ``obs["code_snippet"]`` with ``build_rule_action``. The failure is
	    reported on stderr so stdout keeps carrying only the mandated log lines.
	    """
	    try:
	        return build_llm_action(
	            client=client, obs=obs, step=step, previous_feedback=previous_feedback,
	        )
	    except Exception as llm_err:
	        print(
	            f"[WARN] LLM action failed, using rule-based fallback: {llm_err}",
	            file=sys.stderr,
	        )
	        # `or ""` also covers an explicit null snippet, which the substring
	        # detectors cannot handle.
	        return build_rule_action(
	            obs.get("code_snippet") or "", obs.get("task_id") or "",
	        )


	# ---------------------------------------------------------------------------
	# Agent loop — one task episode with iterative refinement
	# ---------------------------------------------------------------------------


	def run_task(client: Any, task_id: str) -> None:
	    """Run one task episode with iterative refinement and mandatory logs.

	    Emits one ``[START]`` line, one ``[STEP]`` line per environment step and
	    exactly one ``[END]`` line, the last from a ``finally`` clause so it is
	    printed even when the episode aborts. Up to ``MAX_STEPS`` actions are
	    submitted; the feedback returned by each step is passed to the next prompt.
	    The final score is the last recorded reward, and the episode succeeds when
	    that score reaches the task's entry in ``SUCCESS_THRESHOLDS`` (0.95 if the
	    task has none).

	    Args:
	        client: LLM client forwarded to ``get_action``.
	        task_id: Task to reset the environment with.
	    """
	    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

	    rewards: list[float] = []
	    steps_taken = 0
	    # Score reported when the episode never produces a reward.
	    final_score = 0.5
	    success = False
	    previous_feedback = ""

	    try:
	        reset_payload = env_reset(task_id=task_id)
	        obs, reward, done = unwrap_step_payload(reset_payload)

	        if reward:
	            rewards.append(reward)

	        threshold = SUCCESS_THRESHOLDS.get(task_id, 0.95)

	        for step in range(1, MAX_STEPS + 1):
	            if done:
	                break

	            # Use previous feedback for iterative refinement
	            action_payload = get_action(
	                client=client, obs=obs, step=step, previous_feedback=previous_feedback,
	            )
	            action_str = json.dumps(action_payload, separators=(",", ":"))

	            try:
	                step_payload = env_step(action=action_payload)
	                obs, reward, done = unwrap_step_payload(step_payload)
	            except Exception as step_err:
	                steps_taken = step
	                # Record the failed step's zero reward so the [END] rewards match
	                # the [STEP] lines and an earlier reward cannot mark a failed
	                # episode as successful.
	                rewards.append(0.0)
	                log_step(
	                    step=step, action=action_str, reward=0.0, done=True,
	                    error=str(step_err),
	                )
	                break

	            rewards.append(reward)
	            steps_taken = step

	            # Capture feedback for next iteration
	            previous_feedback = obs.get("feedback", "")

	            log_step(step=step, action=action_str, reward=reward, done=done, error=None)

	            if done:
	                break

	        if rewards:
	            final_score = rewards[-1]
	            success = final_score >= threshold

	    except Exception as task_err:
	        success = False
	        # stderr only: stdout is reserved for the [START]/[STEP]/[END] lines.
	        print(f"[WARN] task {task_id} aborted: {task_err}", file=sys.stderr)
	    finally:
	        log_end(success=success, steps=steps_taken, score=final_score, rewards=rewards)


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------


	def main() -> None:
	    """Create an LLM client, wait for the environment server, run every task.

	    The OpenAI SDK client is used when it can be imported and constructed;
	    otherwise ``PureUrllibOpenAIClient`` is used and a warning goes to stderr.
	    ``wait_for_server`` may raise if the server never becomes ready.
	    """
	    # Dynamically fetch at runtime to pick up injected env vars
	    val_api_base = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
	    # Same fallback chain as the module-level API_KEY, including OPENAI_API_KEY.
	    val_api_key = (
	        os.environ.get("API_KEY")
	        or os.environ.get("HF_TOKEN")
	        or os.environ.get("OPENAI_API_KEY")
	        or "missing-api-key"
	    )

	    client: Any
	    try:
	        from openai import OpenAI

	        client = OpenAI(base_url=val_api_base, api_key=val_api_key)
	    except Exception as e:
	        print(
	            f"[WARN] openai unavailable, using urllib fallback: {e}",
	            file=sys.stderr,
	        )
	        client = PureUrllibOpenAIClient(base_url=val_api_base, api_key=val_api_key)

	    wait_for_server(timeout=60)

	    for task_id in TASKS:
	        run_task(client=client, task_id=task_id)


	# Run the full task list only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()