Initial CodeLens OpenEnv submission
Browse files- .dockerignore +2 -2
- .env.example +5 -0
- Dockerfile +27 -10
- README.md +101 -5
- app.py +10 -0
- codelens_env/env.py +22 -1
- codelens_env/models.py +10 -0
- inference.py +9 -8
- openenv.yaml +3 -0
.dockerignore
CHANGED
|
@@ -10,8 +10,8 @@ build/
|
|
| 10 |
*.egg
|
| 11 |
MANIFEST
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
dashboard/node_modules/
|
| 16 |
dashboard/src/
|
| 17 |
dashboard/public/
|
|
|
|
| 10 |
*.egg
|
| 11 |
MANIFEST
|
| 12 |
|
| 13 |
+
# Dashboard Build (Must be built inside Docker to avoid local skew)
|
| 14 |
+
static/dashboard/
|
| 15 |
dashboard/node_modules/
|
| 16 |
dashboard/src/
|
| 17 |
dashboard/public/
|
.env.example
CHANGED
|
@@ -23,3 +23,8 @@ LEADERBOARD_LIMIT=10 # Default entries per task page
|
|
| 23 |
|
| 24 |
# Logging
|
| 25 |
LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Logging
|
| 25 |
LOG_LEVEL=INFO # DEBUG | INFO | WARNING | ERROR
|
| 26 |
+
|
| 27 |
+
# Inference (OpenEnv spec)
|
| 28 |
+
OPENAI_API_KEY= # Required for inference.py (OpenAI-compatible API key)
|
| 29 |
+
API_BASE_URL=https://api.openai.com/v1
|
| 30 |
+
MODEL_NAME=gpt-3.5-turbo
|
Dockerfile
CHANGED
|
@@ -1,20 +1,33 @@
|
|
| 1 |
-
# ── Stage 1: Builder ─────────────────────────────────
|
| 2 |
-
FROM
|
| 3 |
|
| 4 |
-
WORKDIR /
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# Install build dependencies
|
| 7 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 8 |
-
|
| 9 |
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
|
| 11 |
-
# Install Python dependencies into /build/venv
|
| 12 |
COPY requirements.txt .
|
| 13 |
-
RUN python -m venv /build/venv \
|
| 14 |
-
&& /build/venv/bin/pip install --upgrade pip \
|
| 15 |
-
&& /build/venv/bin/pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
| 17 |
-
# ── Stage
|
| 18 |
FROM python:3.11-slim AS production
|
| 19 |
|
| 20 |
# Security: run as non-root user
|
|
@@ -28,7 +41,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 28 |
&& rm -rf /var/lib/apt/lists/*
|
| 29 |
|
| 30 |
# Copy virtualenv from builder
|
| 31 |
-
COPY --from=builder /build/venv /app/venv
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# Copy application code
|
| 34 |
COPY --chown=appuser:appuser . .
|
|
|
|
| 1 |
+
# ── Stage 1: Frontend Builder ─────────────────────────────────
|
| 2 |
+
FROM node:20-slim AS frontend-builder
|
| 3 |
|
| 4 |
+
WORKDIR /src/dashboard
|
| 5 |
+
|
| 6 |
+
# Install dependencies
|
| 7 |
+
COPY dashboard/package*.json ./
|
| 8 |
+
RUN npm install
|
| 9 |
+
|
| 10 |
+
# Copy source and build (vite.config.ts outputs to ../static/dashboard)
|
| 11 |
+
COPY dashboard/ .
|
| 12 |
+
RUN npm run build
|
| 13 |
+
|
| 14 |
+
# ── Stage 2: Python Builder ───────────────────────────────────
|
| 15 |
+
FROM python:3.11-slim AS python-builder
|
| 16 |
+
|
| 17 |
+
WORKDIR /build-python
|
| 18 |
|
| 19 |
# Install build dependencies
|
| 20 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 21 |
+
build-essential \
|
| 22 |
&& rm -rf /var/lib/apt/lists/*
|
| 23 |
|
| 24 |
+
# Install Python dependencies into /build-python/venv
|
| 25 |
COPY requirements.txt .
|
| 26 |
+
RUN python -m venv /build-python/venv \
|
| 27 |
+
&& /build-python/venv/bin/pip install --upgrade pip \
|
| 28 |
+
&& /build-python/venv/bin/pip install --no-cache-dir -r requirements.txt
|
| 29 |
|
| 30 |
+
# ── Stage 3: Production ───────────────────────────────────────
|
| 31 |
FROM python:3.11-slim AS production
|
| 32 |
|
| 33 |
# Security: run as non-root user
|
|
|
|
| 41 |
&& rm -rf /var/lib/apt/lists/*
|
| 42 |
|
| 43 |
# Copy virtualenv from builder
|
| 44 |
+
COPY --from=python-builder /build-python/venv /app/venv
|
| 45 |
+
|
| 46 |
+
# Copy dashboard build from frontend-builder
|
| 47 |
+
# (Vite config builds to ../static/dashboard relative to /src/dashboard)
|
| 48 |
+
COPY --chown=appuser:appuser --from=frontend-builder /src/static/dashboard /app/static/dashboard
|
| 49 |
|
| 50 |
# Copy application code
|
| 51 |
COPY --chown=appuser:appuser . .
|
README.md
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
<p align="center">
|
| 2 |
<img src="assets/codelens-brand-v2.svg" width="400" alt="CodeLens." />
|
| 3 |
</p>
|
|
@@ -17,6 +28,19 @@ Designed for researchers and developers building the next generation of AI code
|
|
| 17 |
|
| 18 |
---
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
## Quick Start
|
| 21 |
|
| 22 |
Get up and running locally in under 2 minutes:
|
|
@@ -40,11 +64,65 @@ PYTHONPATH=. python app.py
|
|
| 40 |
|
| 41 |
CodeLens benchmarks agents across three critical engineering domains:
|
| 42 |
|
| 43 |
-
| Task | Scenarios | Max Steps | Focus Area |
|
| 44 |
-
| ---------------------- | --------- | --------- | -------------------------------------------------------------------------- |
|
| 45 |
-
| `bug_detection` | 10 | 10 | Off-by-one errors, null dereferences, race conditions, exception handling |
|
| 46 |
-
| `security_audit` | 10 | 15 | SQL injection, hardcoded secrets, path traversal, insecure deserialization |
|
| 47 |
-
| `architectural_review` | 10 | 20 | N+1 queries, god classes, blocking async calls, circular imports |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
---
|
| 50 |
|
|
@@ -71,6 +149,24 @@ Every episode permits **5 false positive credits**. Flagging non-existent code p
|
|
| 71 |
|
| 72 |
---
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
## API Reference
|
| 75 |
|
| 76 |
| Method | Endpoint | Auth | Description |
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CodeLens Environment
|
| 3 |
+
emoji: 🔍
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
<p align="center">
|
| 13 |
<img src="assets/codelens-brand-v2.svg" width="400" alt="CodeLens." />
|
| 14 |
</p>
|
|
|
|
| 28 |
|
| 29 |
---
|
| 30 |
|
| 31 |
+
## 💡 Motivation
|
| 32 |
+
|
| 33 |
+
Progress in AI coding assistants has largely focused on **generation** (writing code), but **evaluation** (reviewing code) is equally critical for software reliability. Manual code review is a high-cognitive-load, real-world task that requires:
|
| 34 |
+
- **Precision**: Identifying exactly where a bug exists.
|
| 35 |
+
- **Context**: Understanding how a local change affects the whole system.
|
| 36 |
+
- **Security-First Mindset**: Spotting non-obvious vulnerabilities like SQL injection or race conditions.
|
| 37 |
+
|
| 38 |
+
CodeLens transforms these human-centric skills into a **measurable benchmark**, allowing researchers to evaluate agents on their ability to act as high-fidelity gatekeepers of code quality.
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
## Quick Start
|
| 45 |
|
| 46 |
Get up and running locally in under 2 minutes:
|
|
|
|
| 64 |
|
| 65 |
CodeLens benchmarks agents across three critical engineering domains:
|
| 66 |
|
| 67 |
+
| Task | Difficulty | Scenarios | Max Steps | Focus Area |
|
| 68 |
+
| ---------------------- | ---------- | --------- | --------- | -------------------------------------------------------------------------- |
|
| 69 |
+
| `bug_detection` | **Easy** | 10 | 10 | Off-by-one errors, null dereferences, race conditions, exception handling |
|
| 70 |
+
| `security_audit` | **Medium** | 10 | 15 | SQL injection, hardcoded secrets, path traversal, insecure deserialization |
|
| 71 |
+
| `architectural_review` | **Hard** | 10 | 20 | N+1 queries, god classes, blocking async calls, circular imports |
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## 🎯 Observation Space
|
| 76 |
+
|
| 77 |
+
Each `step()` and `reset()` call returns a typed `Observation` object:
|
| 78 |
+
|
| 79 |
+
| Field | Type | Description |
|
| 80 |
+
| ---------------- | ----------------- | ---------------------------------------------- |
|
| 81 |
+
| `task_id` | `TaskId` (enum) | One of `bug_detection`, `security_audit`, `architectural_review` |
|
| 82 |
+
| `scenario_hash` | `str` | Deterministic identifier for the scenario |
|
| 83 |
+
| `pr_title` | `str` | Title of the synthetic pull request |
|
| 84 |
+
| `pr_description` | `str` | Description/context for the PR |
|
| 85 |
+
| `diff` | `str` | Full unified diff (all files concatenated) |
|
| 86 |
+
| `files_changed` | `List[FileChanged]` | Structured file patches with metadata |
|
| 87 |
+
| `step_count` | `int` | Current step number (0-indexed) |
|
| 88 |
+
| `max_steps` | `int` | Maximum steps allowed for this task |
|
| 89 |
+
| `noise_budget` | `int` | Remaining false-positive credits (starts at 5) |
|
| 90 |
+
| `issues_flagged` | `int` | Number of correctly matched issues so far |
|
| 91 |
+
| `done` | `bool` | Whether the episode has terminated |
|
| 92 |
+
|
| 93 |
+
## 🎮 Action Space
|
| 94 |
+
|
| 95 |
+
Agents submit typed `Action` objects with the following fields:
|
| 96 |
+
|
| 97 |
+
| Field | Type | Required For | Description |
|
| 98 |
+
| --------------- | ------------------ | ------------------- | -------------------------------------------- |
|
| 99 |
+
| `action_type` | `ActionType` (enum)| All actions | `flag_issue`, `approve`, `request_changes`, `comment`, `ask_question` |
|
| 100 |
+
| `body` | `str` | All actions | Description or explanation text |
|
| 101 |
+
| `filename` | `str` | `flag_issue` | File containing the issue |
|
| 102 |
+
| `line_number` | `int` | `flag_issue` | Approximate line number of the issue |
|
| 103 |
+
| `category` | `Category` (enum) | `flag_issue` | `bug`, `security`, `architecture`, `style`, `performance` |
|
| 104 |
+
| `severity` | `Severity` (enum) | `flag_issue` | `critical`, `high`, `medium`, `low`, `info` |
|
| 105 |
+
| `verdict` | `Verdict` (enum) | `approve` / `request_changes` | `lgtm`, `request_changes`, `needs_discussion` |
|
| 106 |
+
|
| 107 |
+
### Reward Signal
|
| 108 |
+
|
| 109 |
+
Each `step()` returns a typed `Reward` object:
|
| 110 |
+
|
| 111 |
+
| Field | Type | Description |
|
| 112 |
+
| -------------- | ------- | ------------------------------------------------ |
|
| 113 |
+
| `value` | `float` | Normalised score (0.0–1.0); negative step rewards are clamped to 0.0 |
|
| 114 |
+
| `reason` | `str` | Human-readable explanation of the reward |
|
| 115 |
+
| `is_terminal` | `bool` | `True` on the final step of an episode |
|
| 116 |
+
|
| 117 |
+
**Reward shaping:** Correct issue flags yield positive rewards scaled by severity (critical=1.0, high=0.8, medium=0.5, low=0.2). False positives and duplicates incur −0.05 penalties and consume noise budget. Episodes terminate when noise budget reaches zero, max steps are exceeded, or a terminal action (approve/request_changes) is submitted.
|
| 118 |
+
|
| 119 |
+
### 🧠 Environment Design Highlights
|
| 120 |
+
|
| 121 |
+
- **Predictable State Management**: The `reset()` and `step()` functions are strictly deterministic for a given task/seed pair, ensuring 100% reproducible episodes.
|
| 122 |
+
- **Dense Reward Signal**: Unlike "win/loss" environments, CodeLens provides continuous feedback. Every action—from the first issue flagged to the final verdict—produces a typed `Reward` object with human-readable rationale, accelerating agent learning (process supervision).
|
| 123 |
+
- **Novelty: The Reviewer Trust Mechanic**: The **Noise Budget** (5 credits) simulates real-world developer trust. If an agent "hallucinates" too many non-existent bugs, it exhausts the budget and the episode terminates, penalizing high-volume, low-precision behavior.
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
|
| 127 |
---
|
| 128 |
|
|
|
|
| 149 |
|
| 150 |
---
|
| 151 |
|
| 152 |
+
## 📊 Baseline Scores
|
| 153 |
+
|
| 154 |
+
Reproducible keyword-based baseline results across all 30 scenarios (10 seeds per task):
|
| 155 |
+
|
| 156 |
+
| Task | Mean Score | Best Score | Worst Score | Success Rate (>0.5) |
|
| 157 |
+
| ---------------------- | ---------- | ---------- | ----------- | ------------------- |
|
| 158 |
+
| `bug_detection` | 0.3577 | 0.9167 | 0.0000 | 40% |
|
| 159 |
+
| `security_audit` | 0.1850 | 1.0000 | 0.0000 | 20% |
|
| 160 |
+
| `architectural_review` | 0.2930 | 0.6640 | 0.0000 | 40% |
|
| 161 |
+
| **Overall** | **0.2786** | — | — | **33%** |
|
| 162 |
+
|
| 163 |
+
> **Agent:** `KeywordAgent` (heuristic, 35+ rules) — see `scripts/baseline.py`
|
| 164 |
+
> **Reproduce:** `python scripts/evaluate.py --agent keyword --output results.json`
|
| 165 |
+
|
| 166 |
+
These scores represent a deterministic lower bound. LLM-powered agents (e.g., GPT-4o, Claude) are expected to significantly outperform this baseline.
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
## API Reference
|
| 171 |
|
| 172 |
| Method | Endpoint | Auth | Description |
|
app.py
CHANGED
|
@@ -193,6 +193,16 @@ async def http_exception_handler(request, exc):
|
|
| 193 |
|
| 194 |
# ── Endpoints ─────────────────────────────────────────────────────────────────
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
@app.get("/health")
|
| 197 |
def health_check():
|
| 198 |
return {
|
|
|
|
| 193 |
|
| 194 |
# ── Endpoints ─────────────────────────────────────────────────────────────────
|
| 195 |
|
| 196 |
+
@app.get("/", include_in_schema=False)
|
| 197 |
+
def root():
|
| 198 |
+
"""Absolute root ping handler for infrastructure readiness checks."""
|
| 199 |
+
return {
|
| 200 |
+
"status": "ready",
|
| 201 |
+
"message": "CodeLens API is operational.",
|
| 202 |
+
"docs": "/docs",
|
| 203 |
+
"health": "/health"
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
@app.get("/health")
|
| 207 |
def health_check():
|
| 208 |
return {
|
codelens_env/env.py
CHANGED
|
@@ -2,7 +2,7 @@ from datetime import datetime, timezone
|
|
| 2 |
from typing import List, Optional, Set
|
| 3 |
from codelens_env.models import (
|
| 4 |
TaskId, Action, Observation, StepResult, ResetResult,
|
| 5 |
-
ActionType, ActionRecord, EpisodeResult, Severity, GroundTruthIssue
|
| 6 |
)
|
| 7 |
from codelens_env.scenarios import get_scenario
|
| 8 |
from codelens_env.graders.bug_grader import grade_bug_detection
|
|
@@ -61,6 +61,7 @@ class CodeLensEnv:
|
|
| 61 |
|
| 62 |
self.step_count += 1
|
| 63 |
reward = 0.0
|
|
|
|
| 64 |
|
| 65 |
# Determine terminal state and reward
|
| 66 |
if action.action_type in (ActionType.APPROVE, ActionType.REQUEST_CHANGES):
|
|
@@ -100,6 +101,21 @@ class CodeLensEnv:
|
|
| 100 |
self.done = True
|
| 101 |
self.terminated_reason = "max_steps"
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
# Record action
|
| 104 |
record = ActionRecord(
|
| 105 |
action_type=action.action_type,
|
|
@@ -117,6 +133,11 @@ class CodeLensEnv:
|
|
| 117 |
return StepResult(
|
| 118 |
observation=self._build_observation(),
|
| 119 |
reward=float(reward),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
done=self.done,
|
| 121 |
info={"terminated_reason": self.terminated_reason}
|
| 122 |
)
|
|
|
|
| 2 |
from typing import List, Optional, Set
|
| 3 |
from codelens_env.models import (
|
| 4 |
TaskId, Action, Observation, StepResult, ResetResult,
|
| 5 |
+
ActionType, ActionRecord, EpisodeResult, Severity, GroundTruthIssue, Reward
|
| 6 |
)
|
| 7 |
from codelens_env.scenarios import get_scenario
|
| 8 |
from codelens_env.graders.bug_grader import grade_bug_detection
|
|
|
|
| 61 |
|
| 62 |
self.step_count += 1
|
| 63 |
reward = 0.0
|
| 64 |
+
match = None # Track matched ground truth issue (if any)
|
| 65 |
|
| 66 |
# Determine terminal state and reward
|
| 67 |
if action.action_type in (ActionType.APPROVE, ActionType.REQUEST_CHANGES):
|
|
|
|
| 101 |
self.done = True
|
| 102 |
self.terminated_reason = "max_steps"
|
| 103 |
|
| 104 |
+
# Build reward reason
|
| 105 |
+
if action.action_type in (ActionType.APPROVE, ActionType.REQUEST_CHANGES):
|
| 106 |
+
reward_reason = "Terminal action submitted"
|
| 107 |
+
elif action.action_type == ActionType.FLAG_ISSUE:
|
| 108 |
+
if match and match.id in self.matched_issue_ids and reward > 0:
|
| 109 |
+
reward_reason = f"Correctly identified issue: {match.description[:60]}"
|
| 110 |
+
elif match and reward < 0:
|
| 111 |
+
reward_reason = "Duplicate issue flagged"
|
| 112 |
+
elif not match:
|
| 113 |
+
reward_reason = "False positive: no matching ground truth issue"
|
| 114 |
+
else:
|
| 115 |
+
reward_reason = f"Matched issue {match.id}" if match else "No match"
|
| 116 |
+
else:
|
| 117 |
+
reward_reason = "Non-scoring action"
|
| 118 |
+
|
| 119 |
# Record action
|
| 120 |
record = ActionRecord(
|
| 121 |
action_type=action.action_type,
|
|
|
|
| 133 |
return StepResult(
|
| 134 |
observation=self._build_observation(),
|
| 135 |
reward=float(reward),
|
| 136 |
+
reward_info=Reward(
|
| 137 |
+
value=float(max(0.0, reward)),
|
| 138 |
+
reason=reward_reason,
|
| 139 |
+
is_terminal=self.done
|
| 140 |
+
),
|
| 141 |
done=self.done,
|
| 142 |
info={"terminated_reason": self.terminated_reason}
|
| 143 |
)
|
codelens_env/models.py
CHANGED
|
@@ -113,6 +113,15 @@ class Observation(BaseModel):
|
|
| 113 |
issues_flagged: int = 0
|
| 114 |
done: bool = False
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
class ResetResult(BaseModel):
|
| 117 |
task_id: TaskId
|
| 118 |
seed: int
|
|
@@ -122,6 +131,7 @@ class ResetResult(BaseModel):
|
|
| 122 |
class StepResult(BaseModel):
|
| 123 |
observation: Observation
|
| 124 |
reward: float
|
|
|
|
| 125 |
done: bool
|
| 126 |
info: dict = {}
|
| 127 |
|
|
|
|
| 113 |
issues_flagged: int = 0
|
| 114 |
done: bool = False
|
| 115 |
|
| 116 |
+
class Reward(BaseModel):
|
| 117 |
+
"""
|
| 118 |
+
Typed reward signal returned at each step (OpenEnv spec).
|
| 119 |
+
All values are normalized in the 0.0 – 1.0 range.
|
| 120 |
+
"""
|
| 121 |
+
value: float # 0.0 – 1.0 normalised score
|
| 122 |
+
reason: str = "" # human-readable explanation
|
| 123 |
+
is_terminal: bool = False # True on the final step
|
| 124 |
+
|
| 125 |
class ResetResult(BaseModel):
|
| 126 |
task_id: TaskId
|
| 127 |
seed: int
|
|
|
|
| 131 |
class StepResult(BaseModel):
|
| 132 |
observation: Observation
|
| 133 |
reward: float
|
| 134 |
+
reward_info: Reward # typed Reward model (OpenEnv spec)
|
| 135 |
done: bool
|
| 136 |
info: dict = {}
|
| 137 |
|
inference.py
CHANGED
|
@@ -2,12 +2,12 @@
|
|
| 2 |
CodeLens Inference Script — CodeLens Environment
|
| 3 |
==========================================================
|
| 4 |
Required env vars:
|
| 5 |
-
API_BASE_URL
|
| 6 |
-
MODEL_NAME
|
| 7 |
-
HF_TOKEN
|
| 8 |
-
ENV_URL
|
| 9 |
|
| 10 |
-
Output format (stdout, per
|
| 11 |
[START] task=<task_id> env=<env_url> model=<model>
|
| 12 |
[STEP] step=<n> action=<str> reward=<float> done=<bool> error=<str|None>
|
| 13 |
[END] success=<bool> steps=<int> score=<float> rewards=<list>
|
|
@@ -20,10 +20,11 @@ import time
|
|
| 20 |
import requests
|
| 21 |
from openai import OpenAI
|
| 22 |
|
| 23 |
-
# ── Environment Variables (exact names required by
|
| 24 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 25 |
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-3.5-turbo")
|
| 26 |
-
HF_TOKEN
|
|
|
|
| 27 |
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 28 |
|
| 29 |
# ── Config ────────────────────────────────────────────────────────────────────
|
|
@@ -51,7 +52,7 @@ def log_step(step: int, action: str, reward: float, done: bool, error):
|
|
| 51 |
|
| 52 |
def log_end(success: bool, steps: int, score: float, rewards: list):
|
| 53 |
success_str = "true" if success else "false"
|
| 54 |
-
rewards_str = ",".join([f"{r:.2f}" for r in rewards])
|
| 55 |
print(
|
| 56 |
f"[END] success={success_str} steps={steps} score={score:.2f} "
|
| 57 |
f"rewards={rewards_str}",
|
|
|
|
| 2 |
CodeLens Inference Script — CodeLens Environment
|
| 3 |
==========================================================
|
| 4 |
Required env vars:
|
| 5 |
+
API_BASE_URL — OpenAI-compatible base URL (e.g. https://api.openai.com/v1)
|
| 6 |
+
MODEL_NAME — Model identifier (e.g. gpt-4o, gpt-3.5-turbo)
|
| 7 |
+
HF_TOKEN — API key (Hugging Face / OpenAI compatible)
|
| 8 |
+
ENV_URL — CodeLens env URL (default: http://localhost:7860)
|
| 9 |
|
| 10 |
+
Output format (stdout, per OpenEnv spec):
|
| 11 |
[START] task=<task_id> env=<env_url> model=<model>
|
| 12 |
[STEP] step=<n> action=<str> reward=<float> done=<bool> error=<str|None>
|
| 13 |
[END] success=<bool> steps=<int> score=<float> rewards=<list>
|
|
|
|
| 20 |
import requests
|
| 21 |
from openai import OpenAI
|
| 22 |
|
| 23 |
+
# ── Environment Variables (exact names required by hackathon) ──────────────────
|
| 24 |
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 25 |
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-3.5-turbo")
|
| 26 |
+
# Accept either HF_TOKEN (the name the submission instructions require) or OPENAI_API_KEY; defaults to "dummy" if neither is set
|
| 27 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "dummy")
|
| 28 |
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 29 |
|
| 30 |
# ── Config ────────────────────────────────────────────────────────────────────
|
|
|
|
| 52 |
|
| 53 |
def log_end(success: bool, steps: int, score: float, rewards: list):
|
| 54 |
success_str = "true" if success else "false"
|
| 55 |
+
rewards_str = "[" + ",".join([f"{r:.2f}" for r in rewards]) + "]"
|
| 56 |
print(
|
| 57 |
f"[END] success={success_str} steps={steps} score={score:.2f} "
|
| 58 |
f"rewards={rewards_str}",
|
openenv.yaml
CHANGED
|
@@ -9,6 +9,9 @@ description: >
|
|
| 9 |
entry_point: "app:app"
|
| 10 |
dashboard: "/dashboard"
|
| 11 |
api_docs: "/docs"
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
tasks:
|
| 14 |
- id: "bug_detection"
|
|
|
|
| 9 |
entry_point: "app:app"
|
| 10 |
dashboard: "/dashboard"
|
| 11 |
api_docs: "/docs"
|
| 12 |
+
license: "MIT"
|
| 13 |
+
tags: ["code-review", "agentic-eval", "security-audit", "bug-detection"]
|
| 14 |
+
contact: "Arsh Verma <arsh@example.com>"
|
| 15 |
|
| 16 |
tasks:
|
| 17 |
- id: "bug_detection"
|