Spaces:

ceoavinash
/

codearena-rl

Sleeping

havinashpatil

Finalizing CodeArena RL Benchmark: frontend improvements, GRPO training scripts, and cleaned environment

03a7eb9 about 1 month ago

13.4 kB

	"""
	CodeArena RL Environment — Production FastAPI entrypoint.
	This is the primary server that Hugging Face Spaces / OpenEnv evaluator hits.
	Handlers that do real work wrap their bodies in try/except and return fallback
	JSON, so an error raised inside a handler does not surface as a non-200 response.
	"""

	import random
	import traceback
	from typing import Optional

	from fastapi import FastAPI
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel

	from server.models import CodeArenaObservation, CodeArenaAction, TaskInfo
	from server.executor import run_code_with_tests
	from server.grader import calculate_reward, safe_reward, force_valid_reward
	from server.ai_fixer import generate_fix
	from server.raw_runner import run_raw_code
	from server.memory import store_success, log_complexity_reward, get_complexity_reward_stats, get_all_memories
	from server.algorithm_detector import detect_complexity, detect_problem_type, get_optimization_hint
	from tasks import ALL_TASKS


	# ── Lookup map: difficulty string → list of tasks ──────────────────────────
	TASK_MAP: dict[str, list[TaskInfo]] = {}
	for _t in ALL_TASKS:
	    # Create the bucket for a difficulty the first time it is seen.
	    if _t.difficulty not in TASK_MAP:
	        TASK_MAP[_t.difficulty] = []
	    TASK_MAP[_t.difficulty].append(_t)

	# Direct lookup by exact task_id (e.g. "easy-1"); if two tasks share an id,
	# the later one in ALL_TASKS wins.
	TASK_ID_MAP: dict[str, TaskInfo] = {}
	for _t in ALL_TASKS:
	    TASK_ID_MAP[_t.task_id] = _t


	# ── Request schema ─────────────────────────────────────────────────────────
	class ResetRequest(BaseModel):
	    """JSON body accepted by POST /reset."""

	    # Passed to CodeArenaEnv.reset(): an exact task id, a difficulty name, or
	    # "auto". A missing/empty value is treated as "easy" by the endpoint.
	    task_id: Optional[str] = "easy"


	# ── Environment state ─────────────────────────────────────────────────────
	class CodeArenaEnv:
	    """In-memory state for one CodeArena debugging episode.

	    An episode starts with reset(), which selects a task, and advances with
	    step(), which runs a proposed fix against the task's tests and scores it.
	    The final rewards of the most recent finished episodes (at most five) are
	    kept so that reset("auto") can pick a difficulty from their average.

	    Args:
	        max_steps: Number of step() calls after which an episode is marked done.
	    """

	    def __init__(self, max_steps: int = 5):
	        self.tasks = ALL_TASKS
	        self.current_task: TaskInfo | None = None
	        self.previous_attempts: list[str] = []
	        self.last_error_log = ""
	        self.last_test_results = ""
	        self.is_done = False
	        self.step_count = 0
	        self.max_steps = max_steps
	        self.episode_rewards_history: list[float] = []

	    def _auto_difficulty(self) -> str:
	        """Map the average of recent episode rewards to a difficulty name."""
	        history = self.episode_rewards_history
	        if not history:
	            return "easy"
	        avg_reward = sum(history) / len(history)
	        if avg_reward < 0.4:
	            return "easy"
	        if avg_reward <= 0.75:
	            return "medium"
	        return "hard"

	    def reset(self, task_id: str = "easy") -> CodeArenaObservation:
	        """Select a task and clear all per-episode state.

	        Args:
	            task_id: An exact task id, a difficulty name, or "auto" (difficulty
	                chosen from recent rewards). Any other value selects a random
	                task from the full task list.

	        Returns:
	            The initial observation for the selected task.
	        """
	        if task_id == "auto":
	            task_id = self._auto_difficulty()

	        # Priority: exact task_id match → difficulty match → random
	        if task_id in TASK_ID_MAP:
	            self.current_task = TASK_ID_MAP[task_id]
	        elif task_id in TASK_MAP:
	            self.current_task = random.choice(TASK_MAP[task_id])
	        else:
	            self.current_task = random.choice(self.tasks)

	        self.previous_attempts = []
	        self.last_error_log = ""
	        self.last_test_results = ""
	        self.is_done = False
	        self.step_count = 0
	        return self._state()

	    def step(self, action: CodeArenaAction):
	        """Run one proposed fix against the current task and score it.

	        Args:
	            action: The agent's action; its proposed_fix is executed with the
	                current task's test code.

	        Returns:
	            A tuple (observation, final_reward, done, info). final_reward is the
	            base reward minus step and repeat penalties, clamped to
	            [0.001, 0.999].

	        Raises:
	            ValueError: If no task has been selected yet or the episode is done.
	        """
	        # Fail with the same explicit error _state() uses instead of an
	        # AttributeError on None when step() is called before reset().
	        if self.current_task is None:
	            raise ValueError("Environment not initialised. Call /reset first.")
	        if self.is_done:
	            raise ValueError("Environment is done. Call /reset first.")

	        self.step_count += 1

	        print(f"[DEBUG] Step {self.step_count}: Processing action")
	        print(f"[DEBUG] Proposed fix length: {len(action.proposed_fix)} chars")
	        print(f"[DEBUG] Proposed fix preview: {action.proposed_fix[:200]}...")

	        exec_result = run_code_with_tests(
	            code=action.proposed_fix,
	            test_code=self.current_task.test_code,
	            # Ten times the task's optimal time, but never less than 2 seconds.
	            timeout=max(self.current_task.optimal_time_seconds * 10, 2.0),
	        )

	        print(
	            f"[DEBUG] Execution result: compile_success={exec_result.compile_success}, "
	            f"test_passed={exec_result.test_passed}/{exec_result.test_total}, "
	            f"exec_time={exec_result.execution_time_seconds:.2f}s"
	        )
	        if exec_result.runtime_errors:
	            print(f"[DEBUG] Runtime errors: {exec_result.runtime_errors[:500]}")

	        base_reward, reward_components = calculate_reward(exec_result, self.current_task, action.proposed_fix)

	        print(f"[DEBUG] Base reward: {base_reward:.3f}")
	        print(f"[DEBUG] Reward components: {reward_components}")

	        step_penalty = 0.01 * self.step_count  # Reduced from 0.02 for gentler learning
	        # Resubmitting code that was already tried this episode costs extra.
	        novelty_penalty = 0.1 if action.proposed_fix in self.previous_attempts else 0.0

	        print(f"[DEBUG] Penalties: step={step_penalty:.3f}, novelty={novelty_penalty:.3f}")

	        final_reward = base_reward - step_penalty - novelty_penalty
	        final_reward = max(0.001, min(0.999, float(final_reward)))

	        print(f"[DEBUG] Final reward: {final_reward:.3f}")

	        self.previous_attempts.append(action.proposed_fix)
	        self.last_error_log = exec_result.runtime_errors
	        self.last_test_results = (
	            f"{exec_result.test_passed}/{exec_result.test_total} tests passed."
	        )

	        # NOTE(review): step_penalty is at least 0.01 here, so the reward-based
	        # exit only fires when base_reward exceeds 1.0 — confirm against
	        # calculate_reward's range; otherwise episodes always run max_steps.
	        if final_reward > 0.99 or self.step_count >= self.max_steps:
	            self.is_done = True
	            self.episode_rewards_history.append(final_reward)
	            # Keep only the five most recent episode rewards.
	            if len(self.episode_rewards_history) > 5:
	                self.episode_rewards_history.pop(0)

	        info = {
	            "execution_metadata": exec_result.model_dump(),
	            "task_id": self.current_task.task_id,
	            "reward_components": reward_components,
	            "test_results": self.last_test_results,
	            "llm_feedback": reward_components.get("feedback", "No feedback provided."),
	        }
	        return self._state(), final_reward, self.is_done, info

	    def _state(self) -> CodeArenaObservation:
	        """Build the observation for the current task and attempt history.

	        Raises:
	            ValueError: If no task has been selected yet.
	        """
	        if self.current_task is None:
	            raise ValueError("Environment not initialised. Call /reset first.")
	        return CodeArenaObservation(
	            buggy_code=self.current_task.buggy_code,
	            error_log=self.last_error_log,
	            test_results=self.last_test_results,
	            previous_attempts=self.previous_attempts,
	        )


	# ── FastAPI app ────────────────────────────────────────────────────────────
	app = FastAPI(title="CodeArena RL Environment")

	# Open CORS so the Vite dev server (port 3000) and any other origin can call this API.
	# NOTE(review): wildcard origins are combined with allow_credentials=True; the
	# FastAPI CORS docs state credentialed requests need explicit origins — confirm
	# whether credentials are needed here at all.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# Single environment instance shared by every request handler in this module.
	_env = CodeArenaEnv()


	@app.get("/health")
	def health():
	    """Liveness probe: returns a fixed status payload."""
	    payload = {"status": "ok", "environment": "CodeArena"}
	    return payload


	@app.post("/reset")
	def api_reset(body: ResetRequest = ResetRequest()):
	    """Reset the environment. NEVER crashes — returns fallback JSON on error.

	    Args:
	        body: Optional request body; a missing or empty task_id means "easy".

	    Returns:
	        A dict with "status", "message", "observation" and "info". On failure
	        the observation is empty except for error_log (the formatted
	        traceback) and info carries empty strings, so both paths share one
	        response shape.
	    """
	    try:
	        task_id = body.task_id or "easy"
	        obs = _env.reset(task_id=task_id)
	        task = _env.current_task
	        return {
	            "status": "success",
	            "message": "Environment reset successfully",
	            "observation": obs.model_dump(),
	            "info": {
	                "task_id": task.task_id if task else "",
	                "difficulty": task.difficulty if task else "",
	            },
	        }
	    except Exception:
	        traceback.print_exc()
	        return {
	            "status": "error",
	            "message": "fallback response",
	            "observation": {
	                "buggy_code": "",
	                "error_log": traceback.format_exc(),
	                "test_results": "",
	                "previous_attempts": [],
	            },
	            # Mirror the success payload so clients can always read info.
	            "info": {"task_id": "", "difficulty": ""},
	        }


	@app.post("/step")
	def api_step(action: CodeArenaAction):
	    """Score one submitted fix and return the resulting transition.

	    Returns observation/reward/done/info on success, a bare error message when
	    no code was submitted, and a fallback payload with done=True if anything
	    raises.
	    """
	    try:
	        # Compatibility: the code may arrive under 'proposed_fix' or 'action'.
	        submitted = action.proposed_fix or action.action
	        if not submitted:
	            return {"status": "error", "message": "No code provided in 'proposed_fix' or 'action'"}

	        # The environment only reads proposed_fix, so normalise onto that field.
	        action.proposed_fix = submitted

	        obs, reward, done, info = _env.step(action)
	        # Safety fallback before force_valid_reward
	        reward = 0.5 if reward is None else reward
	        response = {
	            "observation": obs.model_dump(),
	            "reward": force_valid_reward(reward),
	            "done": done,
	            "info": info,
	        }
	        return response
	    except Exception:
	        traceback.print_exc()
	        empty_observation = {
	            "buggy_code": "",
	            "error_log": str(traceback.format_exc()),
	            "test_results": "",
	            "previous_attempts": [],
	        }
	        return {
	            "status": "error",
	            "message": "fallback response",
	            "observation": empty_observation,
	            "reward": force_valid_reward(0.1),
	            "done": True,
	            "info": {},
	        }


	@app.get("/state")
	def api_state():
	    """Report the current step count, attempt history and observation.

	    Returns a generic fallback payload if the observation cannot be built
	    (for example before the first reset).
	    """
	    try:
	        obs = _env._state()
	        snapshot = {"step": _env.step_count, "history": _env.previous_attempts}
	        snapshot["observation"] = obs.model_dump()
	        return snapshot
	    except Exception:
	        traceback.print_exc()
	        return {"status": "error", "message": "fallback response"}


	# ── AI Fix endpoint ───────────────────────────────────────────────────────
	class FixRequest(BaseModel):
	    """JSON body accepted by POST /fix; every field is forwarded to generate_fix()."""

	    # Source code to repair (required).
	    code: str
	    # Error output associated with the code; None is sent on as "".
	    error_log: Optional[str] = ""
	    # Ollama settings, passed through unchanged.
	    ollama_url: Optional[str] = "http://localhost:11434"
	    model: Optional[str] = "llama3.2:latest"
	    use_ollama: Optional[bool] = True
	    # Context passed through to the fixer; None is sent on as 0.0 / "".
	    reward: Optional[float] = 0.0
	    task_id: Optional[str] = ""


	@app.post("/fix")
	def api_fix(body: FixRequest):
	    """Generate a code fix using Ollama (if available) or built-in pattern fixer.

	    If the fixer raises, the submitted code is returned unchanged with
	    success=False and the formatted traceback under "error".
	    """
	    try:
	        # NOTE(review): ollama_url comes straight from the client — presumably
	        # the fixer issues HTTP requests to it; verify this cannot be abused.
	        fixer_kwargs = {
	            "code": body.code,
	            "error_log": body.error_log or "",
	            "ollama_url": body.ollama_url,
	            "model": body.model,
	            "use_ollama": body.use_ollama,
	            "reward": body.reward or 0.0,
	            "task_id": body.task_id or "",
	        }
	        return generate_fix(**fixer_kwargs)
	    except Exception:
	        traceback.print_exc()
	        passthrough = {
	            "fixed_code": body.code,
	            "method": "passthrough",
	            "success": False,
	            "error": traceback.format_exc(),
	        }
	        return passthrough


	# ── Raw Runner endpoint (Sandbox) ──────────────────────────────────────────
	class RawRequest(BaseModel):
	    """JSON body accepted by POST /run_raw."""

	    # Source code handed to run_raw_code() as-is.
	    code: str

	@app.post("/run_raw")
	def api_run_raw(body: RawRequest):
	    """Run arbitrary code without tests and return output/complexity and reward.

	    The reward is a simulated sandbox score: 0.95 for a run with empty stderr,
	    0.1 otherwise, minus 0.15 when execution took longer than one second.
	    """
	    # NOTE(review): this executes client-supplied code; isolation depends
	    # entirely on run_raw_code, which is not visible here — confirm.
	    try:
	        result = run_raw_code(body.code)

	        # Simulated sandbox reward: start from the outcome implied by stderr.
	        had_errors = bool(result.stderr)
	        reward = 0.1 if had_errors else 0.95
	        reward_components = {
	            "Execution Success": 0.5,
	            "Error Free": 0.0 if had_errors else 0.45,
	        }

	        # Runs slower than one second take an additional flat penalty.
	        if result.execution_time > 1.0:
	            reward -= 0.15
	            reward_components["Time Complexity"] = -0.15

	        payload = {
	            "status": "success",
	            "stdout": result.stdout,
	            "stderr": result.stderr,
	            "execution_time": result.execution_time,
	            "time_complexity_hint": result.time_complexity_hint,
	            "reward": force_valid_reward(reward),
	            "reward_components": reward_components,
	            # Sandbox mode is never "done" strictly by execution, AI must verify optimality
	            "done": False,
	        }
	        return payload
	    except Exception as exc:
	        traceback.print_exc()
	        return {
	            "status": "error",
	            "stderr": str(exc),
	            "stdout": "",
	            "execution_time": 0,
	            "time_complexity_hint": "Error evaluating complexity.",
	            "reward": force_valid_reward(0.0),
	            "reward_components": {},
	            "done": False,
	        }


	# ── Stats & Memory endpoints (Research Dashboard) ─────────────────────────
	@app.get("/stats")
	def api_stats():
	    """Return complexity vs reward stats from CSV log.

	    Also reports the environment's recent episode rewards and their mean
	    (0.0 when there is no history), rounded to three decimals.
	    """
	    try:
	        complexity_stats = get_complexity_reward_stats()
	        history = _env.episode_rewards_history
	        # max(1, ...) keeps the division defined for an empty history.
	        mean_reward = round(sum(history) / max(1, len(history)), 3)
	        return {
	            "complexity_reward_stats": complexity_stats,
	            "episode_history": history,
	            "mean_reward": mean_reward,
	        }
	    except Exception:
	        traceback.print_exc()
	        empty_stats = {"complexity_reward_stats": {}, "episode_history": [], "mean_reward": 0.0}
	        return empty_stats


	@app.get("/memory")
	def api_memory():
	    """Return all stored best solutions from agent memory.

	    Falls back to an empty mapping if the memory store cannot be read.
	    """
	    try:
	        return {"memories": get_all_memories()}
	    except Exception:
	        # Log the failure like every other handler instead of hiding it.
	        traceback.print_exc()
	        return {"memories": {}}


	# ── CLI entrypoint (OpenEnv / script console_scripts) ─────────────────────
	def main():
	    """Run the CodeArena server via uvicorn."""
	    import uvicorn

	    # Listen on all interfaces, port 7860.
	    bind = {"host": "0.0.0.0", "port": 7860}
	    uvicorn.run("server.app:app", **bind)


	if __name__ == "__main__":
	    main()