Skip DB init in TESTING; add reset_db
Browse files
Avoid initializing the SQLite DB during tests by checking the TESTING env var in app.lifespan and logging accordingly. Add scripts/reset_db.py to remove and re-create the DB file for fresh state during development/benchmarks. Update tests: conftest sets TESTING=true and test_api uses a shared client fixture instead of creating TestClient per test. Update results.json sample data (IDs and durations) as part of test/sample data refresh.
- DEPLOYMENT.md +63 -0
- app.py +6 -2
- codelens_env/config.py +2 -0
- codelens_env/database.py +14 -0
- dashboard/vercel.json +15 -0
- results.json +46 -46
- scripts/reset_db.py +47 -0
- tests/conftest.py +2 -0
- tests/test_api.py +4 -8
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeLens. Deployment Guide (Production)
|
| 2 |
+
|
| 3 |
+
Follow this guide to deploy **CodeLens. v1.0.0** to production in the cloud. This configuration uses **Vercel** for the frontend, **Render** for the backend, and **Supabase/Neon** for the PostgreSQL database.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 1. 🗄️ Setup the Database (PostgreSQL)
|
| 8 |
+
|
| 9 |
+
SQLite stores its data in a local file, and the filesystem on Render/Vercel is ephemeral: that file is lost on every restart or redeploy. For production you **must** therefore use a managed PostgreSQL service.
|
| 10 |
+
|
| 11 |
+
1. **Go to [Supabase](https://supabase.com)** or [Neon](https://neon.tech).
|
| 12 |
+
2. **Create a new Project** called "CodeLens".
|
| 13 |
+
3. **Copy your Connection String** (it should look like `postgres://user:pass@host:5432/dbname`).
|
| 14 |
+
4. **Important**: Keep this URL safe—it is your `DATABASE_URL`.
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## 2. 🚀 Setup the Backend (Render)
|
| 19 |
+
|
| 20 |
+
Render will host your FastAPI API and your Dockerized environment.
|
| 21 |
+
|
| 22 |
+
1. **Go to [Render Dashboard](https://dashboard.render.com)**.
|
| 23 |
+
2. **New -> Web Service** and connect your GitHub repository.
|
| 24 |
+
3. **Configure**:
|
| 25 |
+
- **Runtime**: `Docker`.
|
| 26 |
+
- **Environment Variables**:
|
| 27 |
+
- `DATABASE_URL`: (Paste your Supabase/Neon URL here).
|
| 28 |
+
- `API_KEY_ENABLED`: `true` (highly recommended for production).
|
| 29 |
+
- `API_KEY`: A strong secret password.
|
| 30 |
+
- `APP_ENV`: `production`.
|
| 31 |
+
4. **Deploy**: Render will automatically build the `Dockerfile` in the root and start the service.
|
| 32 |
+
5. **Identify**: Copy your Render URL (e.g., `https://codelens-api.onrender.com`).
|
| 33 |
+
|
| 34 |
+
---
|
| 35 |
+
|
| 36 |
+
## 3. 🎨 Setup the Frontend (Vercel)
|
| 37 |
+
|
| 38 |
+
Vercel will host your React/Vite dashboard.
|
| 39 |
+
|
| 40 |
+
1. **Go to [Vercel](https://vercel.com)**.
|
| 41 |
+
2. **Import** your `dashboard` folder (or the whole repository and set the root directory to `dashboard`).
|
| 42 |
+
3. **Update `vercel.json`**:
|
| 43 |
+
- Open [`dashboard/vercel.json`](dashboard/vercel.json).
|
| 44 |
+
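- Replace the `YOUR_BACKEND_URL.render.com` placeholder host in **both** rewrite destinations (the `/api/` rule and the `/ws/` rule) with your **real** Render host (e.g., `codelens-api.onrender.com`).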
|
| 45 |
+
4. **Deploy**: Vercel will build the React application and provide a global dashboard link.
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## 4. 🤖 Running Remote Evaluations
|
| 50 |
+
|
| 51 |
+
Once deployed, you can run the benchmark script from your local machine (or any CI) against your **production** instance:
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
python scripts/evaluate.py --url https://your-render-url.com --api-key YOUR_SECRET_KEY
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
> [!CAUTION]
|
| 60 |
+
> **Database Migrations**: When you first deploy against a new PostgreSQL instance, the database has no tables yet. On application startup, the lifespan hook calls `create_db_and_tables()` automatically (it is skipped only when the `TESTING` environment variable is set), so no manual SQL is required.
|
| 61 |
+
|
| 62 |
+
> [!TIP]
|
| 63 |
+
> **Vercel Rewrites**: The `vercel.json` rewrite rule is what allows the frontend to talk to the backend without CORS issues. Ensure the URL is exactly correct.
|
app.py
CHANGED
|
@@ -42,9 +42,13 @@ logger = logging.getLogger("codelens_env")
|
|
| 42 |
@asynccontextmanager
|
| 43 |
async def lifespan(app: FastAPI):
|
| 44 |
# Startup
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
cleanup_task = asyncio.create_task(cleanup_expired_episodes())
|
| 47 |
-
logger.info(f"CodeLens API started — DB at {settings.db_path}")
|
| 48 |
|
| 49 |
yield
|
| 50 |
|
|
|
|
| 42 |
@asynccontextmanager
|
| 43 |
async def lifespan(app: FastAPI):
|
| 44 |
# Startup
|
| 45 |
+
if not os.getenv("TESTING"):
|
| 46 |
+
create_db_and_tables()
|
| 47 |
+
logger.info(f"CodeLens API started — DB at {settings.db_path}")
|
| 48 |
+
else:
|
| 49 |
+
logger.info("CodeLens API running in TESTING mode — DB initialization skipped")
|
| 50 |
+
|
| 51 |
cleanup_task = asyncio.create_task(cleanup_expired_episodes())
|
|
|
|
| 52 |
|
| 53 |
yield
|
| 54 |
|
codelens_env/config.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
from functools import lru_cache
|
|
|
|
| 2 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 3 |
|
| 4 |
class Settings(BaseSettings):
|
|
@@ -19,6 +20,7 @@ class Settings(BaseSettings):
|
|
| 19 |
rate_limit_per_minute: int = 60 # requests per minute per IP
|
| 20 |
|
| 21 |
# Persistence
|
|
|
|
| 22 |
db_path: str = "./data/codelens.db"
|
| 23 |
db_echo: bool = False # Set True to log all SQL queries
|
| 24 |
|
|
|
|
| 1 |
from functools import lru_cache
|
| 2 |
+
from typing import Optional
|
| 3 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 4 |
|
| 5 |
class Settings(BaseSettings):
|
|
|
|
| 20 |
rate_limit_per_minute: int = 60 # requests per minute per IP
|
| 21 |
|
| 22 |
# Persistence
|
| 23 |
+
database_url: Optional[str] = None
|
| 24 |
db_path: str = "./data/codelens.db"
|
| 25 |
db_echo: bool = False # Set True to log all SQL queries
|
| 26 |
|
codelens_env/database.py
CHANGED
|
@@ -7,6 +7,20 @@ from codelens_env.models import EpisodeResult, TaskId
|
|
| 7 |
|
| 8 |
def get_engine():
|
| 9 |
settings = get_settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
|
| 11 |
return create_engine(
|
| 12 |
f"sqlite:///{settings.db_path}",
|
|
|
|
| 7 |
|
| 8 |
def get_engine():
|
| 9 |
settings = get_settings()
|
| 10 |
+
|
| 11 |
+
if settings.database_url:
|
| 12 |
+
# Support Render/Heroku 'postgres://' URLs by converting to 'postgresql://'
|
| 13 |
+
url = settings.database_url
|
| 14 |
+
if url.startswith("postgres://"):
|
| 15 |
+
url = url.replace("postgres://", "postgresql://", 1)
|
| 16 |
+
|
| 17 |
+
return create_engine(
|
| 18 |
+
url,
|
| 19 |
+
echo=settings.db_echo,
|
| 20 |
+
pool_pre_ping=True, # Ensure connections are alive
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# Fallback to local SQLite
|
| 24 |
Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
|
| 25 |
return create_engine(
|
| 26 |
f"sqlite:///{settings.db_path}",
|
dashboard/vercel.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"rewrites": [
|
| 3 |
+
{
|
| 4 |
+
"source": "/api/(.*)",
|
| 5 |
+
"destination": "https://YOUR_BACKEND_URL.render.com/$1"
|
| 6 |
+
},
|
| 7 |
+
{
|
| 8 |
+
"source": "/ws/(.*)",
|
| 9 |
+
"destination": "wss://YOUR_BACKEND_URL.render.com/ws/$1"
|
| 10 |
+
}
|
| 11 |
+
],
|
| 12 |
+
"framework": "vite",
|
| 13 |
+
"buildCommand": "npm run build",
|
| 14 |
+
"outputDirectory": "dist"
|
| 15 |
+
}
|
results.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"episode_id": "
|
| 4 |
"task_id": "bug_detection",
|
| 5 |
"seed": 0,
|
| 6 |
"final_score": 0.0,
|
|
@@ -9,10 +9,10 @@
|
|
| 9 |
"issues_total": 1,
|
| 10 |
"noise_penalties": 0,
|
| 11 |
"terminated_reason": "terminal_action",
|
| 12 |
-
"duration_seconds": 0.
|
| 13 |
},
|
| 14 |
{
|
| 15 |
-
"episode_id": "
|
| 16 |
"task_id": "bug_detection",
|
| 17 |
"seed": 1,
|
| 18 |
"final_score": 0.0,
|
|
@@ -24,7 +24,7 @@
|
|
| 24 |
"duration_seconds": 0.02
|
| 25 |
},
|
| 26 |
{
|
| 27 |
-
"episode_id": "
|
| 28 |
"task_id": "bug_detection",
|
| 29 |
"seed": 2,
|
| 30 |
"final_score": 0.9167,
|
|
@@ -33,10 +33,10 @@
|
|
| 33 |
"issues_total": 1,
|
| 34 |
"noise_penalties": 5,
|
| 35 |
"terminated_reason": "noise_exhausted",
|
| 36 |
-
"duration_seconds": 0.
|
| 37 |
},
|
| 38 |
{
|
| 39 |
-
"episode_id": "
|
| 40 |
"task_id": "bug_detection",
|
| 41 |
"seed": 3,
|
| 42 |
"final_score": 0.9167,
|
|
@@ -45,10 +45,10 @@
|
|
| 45 |
"issues_total": 1,
|
| 46 |
"noise_penalties": 5,
|
| 47 |
"terminated_reason": "noise_exhausted",
|
| 48 |
-
"duration_seconds": 0.
|
| 49 |
},
|
| 50 |
{
|
| 51 |
-
"episode_id": "
|
| 52 |
"task_id": "bug_detection",
|
| 53 |
"seed": 4,
|
| 54 |
"final_score": 0.8267,
|
|
@@ -60,7 +60,7 @@
|
|
| 60 |
"duration_seconds": 0.03
|
| 61 |
},
|
| 62 |
{
|
| 63 |
-
"episode_id": "
|
| 64 |
"task_id": "bug_detection",
|
| 65 |
"seed": 5,
|
| 66 |
"final_score": 0.0,
|
|
@@ -69,10 +69,10 @@
|
|
| 69 |
"issues_total": 1,
|
| 70 |
"noise_penalties": 0,
|
| 71 |
"terminated_reason": "terminal_action",
|
| 72 |
-
"duration_seconds": 0.
|
| 73 |
},
|
| 74 |
{
|
| 75 |
-
"episode_id": "
|
| 76 |
"task_id": "bug_detection",
|
| 77 |
"seed": 6,
|
| 78 |
"final_score": 0.0,
|
|
@@ -84,7 +84,7 @@
|
|
| 84 |
"duration_seconds": 0.02
|
| 85 |
},
|
| 86 |
{
|
| 87 |
-
"episode_id": "
|
| 88 |
"task_id": "bug_detection",
|
| 89 |
"seed": 7,
|
| 90 |
"final_score": 0.0,
|
|
@@ -96,7 +96,7 @@
|
|
| 96 |
"duration_seconds": 0.02
|
| 97 |
},
|
| 98 |
{
|
| 99 |
-
"episode_id": "
|
| 100 |
"task_id": "bug_detection",
|
| 101 |
"seed": 8,
|
| 102 |
"final_score": 0.9167,
|
|
@@ -105,10 +105,10 @@
|
|
| 105 |
"issues_total": 1,
|
| 106 |
"noise_penalties": 5,
|
| 107 |
"terminated_reason": "noise_exhausted",
|
| 108 |
-
"duration_seconds": 0.
|
| 109 |
},
|
| 110 |
{
|
| 111 |
-
"episode_id": "
|
| 112 |
"task_id": "bug_detection",
|
| 113 |
"seed": 9,
|
| 114 |
"final_score": 0.0,
|
|
@@ -120,7 +120,7 @@
|
|
| 120 |
"duration_seconds": 0.03
|
| 121 |
},
|
| 122 |
{
|
| 123 |
-
"episode_id": "
|
| 124 |
"task_id": "security_audit",
|
| 125 |
"seed": 0,
|
| 126 |
"final_score": 0.0,
|
|
@@ -132,7 +132,7 @@
|
|
| 132 |
"duration_seconds": 0.03
|
| 133 |
},
|
| 134 |
{
|
| 135 |
-
"episode_id": "
|
| 136 |
"task_id": "security_audit",
|
| 137 |
"seed": 1,
|
| 138 |
"final_score": 1.0,
|
|
@@ -141,10 +141,10 @@
|
|
| 141 |
"issues_total": 1,
|
| 142 |
"noise_penalties": 5,
|
| 143 |
"terminated_reason": "noise_exhausted",
|
| 144 |
-
"duration_seconds": 0.
|
| 145 |
},
|
| 146 |
{
|
| 147 |
-
"episode_id": "
|
| 148 |
"task_id": "security_audit",
|
| 149 |
"seed": 2,
|
| 150 |
"final_score": 0.0,
|
|
@@ -156,7 +156,7 @@
|
|
| 156 |
"duration_seconds": 0.03
|
| 157 |
},
|
| 158 |
{
|
| 159 |
-
"episode_id": "
|
| 160 |
"task_id": "security_audit",
|
| 161 |
"seed": 3,
|
| 162 |
"final_score": 0.85,
|
|
@@ -165,10 +165,10 @@
|
|
| 165 |
"issues_total": 1,
|
| 166 |
"noise_penalties": 5,
|
| 167 |
"terminated_reason": "noise_exhausted",
|
| 168 |
-
"duration_seconds": 0.
|
| 169 |
},
|
| 170 |
{
|
| 171 |
-
"episode_id": "
|
| 172 |
"task_id": "security_audit",
|
| 173 |
"seed": 4,
|
| 174 |
"final_score": 0.0,
|
|
@@ -180,7 +180,7 @@
|
|
| 180 |
"duration_seconds": 0.03
|
| 181 |
},
|
| 182 |
{
|
| 183 |
-
"episode_id": "
|
| 184 |
"task_id": "security_audit",
|
| 185 |
"seed": 5,
|
| 186 |
"final_score": 0.0,
|
|
@@ -189,10 +189,10 @@
|
|
| 189 |
"issues_total": 1,
|
| 190 |
"noise_penalties": 5,
|
| 191 |
"terminated_reason": "noise_exhausted",
|
| 192 |
-
"duration_seconds": 0.
|
| 193 |
},
|
| 194 |
{
|
| 195 |
-
"episode_id": "
|
| 196 |
"task_id": "security_audit",
|
| 197 |
"seed": 6,
|
| 198 |
"final_score": 0.0,
|
|
@@ -204,7 +204,7 @@
|
|
| 204 |
"duration_seconds": 0.03
|
| 205 |
},
|
| 206 |
{
|
| 207 |
-
"episode_id": "
|
| 208 |
"task_id": "security_audit",
|
| 209 |
"seed": 7,
|
| 210 |
"final_score": 0.0,
|
|
@@ -216,7 +216,7 @@
|
|
| 216 |
"duration_seconds": 0.03
|
| 217 |
},
|
| 218 |
{
|
| 219 |
-
"episode_id": "
|
| 220 |
"task_id": "security_audit",
|
| 221 |
"seed": 8,
|
| 222 |
"final_score": 0.0,
|
|
@@ -225,10 +225,10 @@
|
|
| 225 |
"issues_total": 1,
|
| 226 |
"noise_penalties": 5,
|
| 227 |
"terminated_reason": "noise_exhausted",
|
| 228 |
-
"duration_seconds": 0.
|
| 229 |
},
|
| 230 |
{
|
| 231 |
-
"episode_id": "
|
| 232 |
"task_id": "security_audit",
|
| 233 |
"seed": 9,
|
| 234 |
"final_score": 0.0,
|
|
@@ -240,7 +240,7 @@
|
|
| 240 |
"duration_seconds": 0.03
|
| 241 |
},
|
| 242 |
{
|
| 243 |
-
"episode_id": "
|
| 244 |
"task_id": "architectural_review",
|
| 245 |
"seed": 0,
|
| 246 |
"final_score": 0.0,
|
|
@@ -249,10 +249,10 @@
|
|
| 249 |
"issues_total": 1,
|
| 250 |
"noise_penalties": 0,
|
| 251 |
"terminated_reason": "terminal_action",
|
| 252 |
-
"duration_seconds": 0.
|
| 253 |
},
|
| 254 |
{
|
| 255 |
-
"episode_id": "
|
| 256 |
"task_id": "architectural_review",
|
| 257 |
"seed": 1,
|
| 258 |
"final_score": 0.059,
|
|
@@ -261,10 +261,10 @@
|
|
| 261 |
"issues_total": 1,
|
| 262 |
"noise_penalties": 5,
|
| 263 |
"terminated_reason": "noise_exhausted",
|
| 264 |
-
"duration_seconds": 0.
|
| 265 |
},
|
| 266 |
{
|
| 267 |
-
"episode_id": "
|
| 268 |
"task_id": "architectural_review",
|
| 269 |
"seed": 2,
|
| 270 |
"final_score": 0.661,
|
|
@@ -273,10 +273,10 @@
|
|
| 273 |
"issues_total": 1,
|
| 274 |
"noise_penalties": 5,
|
| 275 |
"terminated_reason": "noise_exhausted",
|
| 276 |
-
"duration_seconds": 0.
|
| 277 |
},
|
| 278 |
{
|
| 279 |
-
"episode_id": "
|
| 280 |
"task_id": "architectural_review",
|
| 281 |
"seed": 3,
|
| 282 |
"final_score": 0.658,
|
|
@@ -288,7 +288,7 @@
|
|
| 288 |
"duration_seconds": 0.03
|
| 289 |
},
|
| 290 |
{
|
| 291 |
-
"episode_id": "
|
| 292 |
"task_id": "architectural_review",
|
| 293 |
"seed": 4,
|
| 294 |
"final_score": 0.058,
|
|
@@ -300,7 +300,7 @@
|
|
| 300 |
"duration_seconds": 0.03
|
| 301 |
},
|
| 302 |
{
|
| 303 |
-
"episode_id": "
|
| 304 |
"task_id": "architectural_review",
|
| 305 |
"seed": 5,
|
| 306 |
"final_score": 0.657,
|
|
@@ -309,10 +309,10 @@
|
|
| 309 |
"issues_total": 1,
|
| 310 |
"noise_penalties": 5,
|
| 311 |
"terminated_reason": "noise_exhausted",
|
| 312 |
-
"duration_seconds": 0.
|
| 313 |
},
|
| 314 |
{
|
| 315 |
-
"episode_id": "
|
| 316 |
"task_id": "architectural_review",
|
| 317 |
"seed": 6,
|
| 318 |
"final_score": 0.059,
|
|
@@ -324,7 +324,7 @@
|
|
| 324 |
"duration_seconds": 0.03
|
| 325 |
},
|
| 326 |
{
|
| 327 |
-
"episode_id": "
|
| 328 |
"task_id": "architectural_review",
|
| 329 |
"seed": 7,
|
| 330 |
"final_score": 0.664,
|
|
@@ -333,10 +333,10 @@
|
|
| 333 |
"issues_total": 1,
|
| 334 |
"noise_penalties": 5,
|
| 335 |
"terminated_reason": "noise_exhausted",
|
| 336 |
-
"duration_seconds": 0.
|
| 337 |
},
|
| 338 |
{
|
| 339 |
-
"episode_id": "
|
| 340 |
"task_id": "architectural_review",
|
| 341 |
"seed": 8,
|
| 342 |
"final_score": 0.039,
|
|
@@ -345,10 +345,10 @@
|
|
| 345 |
"issues_total": 1,
|
| 346 |
"noise_penalties": 5,
|
| 347 |
"terminated_reason": "noise_exhausted",
|
| 348 |
-
"duration_seconds": 0.
|
| 349 |
},
|
| 350 |
{
|
| 351 |
-
"episode_id": "
|
| 352 |
"task_id": "architectural_review",
|
| 353 |
"seed": 9,
|
| 354 |
"final_score": 0.075,
|
|
@@ -357,6 +357,6 @@
|
|
| 357 |
"issues_total": 1,
|
| 358 |
"noise_penalties": 5,
|
| 359 |
"terminated_reason": "noise_exhausted",
|
| 360 |
-
"duration_seconds": 0.
|
| 361 |
}
|
| 362 |
]
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"episode_id": "27c65a00-9315-4d7d-b457-f8d64b0da466",
|
| 4 |
"task_id": "bug_detection",
|
| 5 |
"seed": 0,
|
| 6 |
"final_score": 0.0,
|
|
|
|
| 9 |
"issues_total": 1,
|
| 10 |
"noise_penalties": 0,
|
| 11 |
"terminated_reason": "terminal_action",
|
| 12 |
+
"duration_seconds": 0.02
|
| 13 |
},
|
| 14 |
{
|
| 15 |
+
"episode_id": "8bdcee56-8a56-4ec8-88fc-a70e78ab48f2",
|
| 16 |
"task_id": "bug_detection",
|
| 17 |
"seed": 1,
|
| 18 |
"final_score": 0.0,
|
|
|
|
| 24 |
"duration_seconds": 0.02
|
| 25 |
},
|
| 26 |
{
|
| 27 |
+
"episode_id": "d3dcda88-ce7e-4965-9409-4e97c98cf444",
|
| 28 |
"task_id": "bug_detection",
|
| 29 |
"seed": 2,
|
| 30 |
"final_score": 0.9167,
|
|
|
|
| 33 |
"issues_total": 1,
|
| 34 |
"noise_penalties": 5,
|
| 35 |
"terminated_reason": "noise_exhausted",
|
| 36 |
+
"duration_seconds": 0.04
|
| 37 |
},
|
| 38 |
{
|
| 39 |
+
"episode_id": "822dfb7d-d67b-4dc9-9cfe-866665a9e5b9",
|
| 40 |
"task_id": "bug_detection",
|
| 41 |
"seed": 3,
|
| 42 |
"final_score": 0.9167,
|
|
|
|
| 45 |
"issues_total": 1,
|
| 46 |
"noise_penalties": 5,
|
| 47 |
"terminated_reason": "noise_exhausted",
|
| 48 |
+
"duration_seconds": 0.03
|
| 49 |
},
|
| 50 |
{
|
| 51 |
+
"episode_id": "c35c2096-b7b8-4a54-b0c7-d8fa1e1538bf",
|
| 52 |
"task_id": "bug_detection",
|
| 53 |
"seed": 4,
|
| 54 |
"final_score": 0.8267,
|
|
|
|
| 60 |
"duration_seconds": 0.03
|
| 61 |
},
|
| 62 |
{
|
| 63 |
+
"episode_id": "9b4e5191-57da-4b10-b31e-5b8236b9b1f4",
|
| 64 |
"task_id": "bug_detection",
|
| 65 |
"seed": 5,
|
| 66 |
"final_score": 0.0,
|
|
|
|
| 69 |
"issues_total": 1,
|
| 70 |
"noise_penalties": 0,
|
| 71 |
"terminated_reason": "terminal_action",
|
| 72 |
+
"duration_seconds": 0.01
|
| 73 |
},
|
| 74 |
{
|
| 75 |
+
"episode_id": "c1e64a16-512a-4716-b0f9-5d9fe14a142d",
|
| 76 |
"task_id": "bug_detection",
|
| 77 |
"seed": 6,
|
| 78 |
"final_score": 0.0,
|
|
|
|
| 84 |
"duration_seconds": 0.02
|
| 85 |
},
|
| 86 |
{
|
| 87 |
+
"episode_id": "cc8bd066-506e-46de-ba0f-0a446f045945",
|
| 88 |
"task_id": "bug_detection",
|
| 89 |
"seed": 7,
|
| 90 |
"final_score": 0.0,
|
|
|
|
| 96 |
"duration_seconds": 0.02
|
| 97 |
},
|
| 98 |
{
|
| 99 |
+
"episode_id": "20dd7158-a49f-4b6c-adcd-6b76b4274e94",
|
| 100 |
"task_id": "bug_detection",
|
| 101 |
"seed": 8,
|
| 102 |
"final_score": 0.9167,
|
|
|
|
| 105 |
"issues_total": 1,
|
| 106 |
"noise_penalties": 5,
|
| 107 |
"terminated_reason": "noise_exhausted",
|
| 108 |
+
"duration_seconds": 0.03
|
| 109 |
},
|
| 110 |
{
|
| 111 |
+
"episode_id": "14b9d1a5-03b7-45b2-8b23-b6c387bced5b",
|
| 112 |
"task_id": "bug_detection",
|
| 113 |
"seed": 9,
|
| 114 |
"final_score": 0.0,
|
|
|
|
| 120 |
"duration_seconds": 0.03
|
| 121 |
},
|
| 122 |
{
|
| 123 |
+
"episode_id": "2b1c4b7f-cf05-4312-89d2-49883b9ed6c1",
|
| 124 |
"task_id": "security_audit",
|
| 125 |
"seed": 0,
|
| 126 |
"final_score": 0.0,
|
|
|
|
| 132 |
"duration_seconds": 0.03
|
| 133 |
},
|
| 134 |
{
|
| 135 |
+
"episode_id": "47145114-7a99-4c81-baa9-1f9c4221f48a",
|
| 136 |
"task_id": "security_audit",
|
| 137 |
"seed": 1,
|
| 138 |
"final_score": 1.0,
|
|
|
|
| 141 |
"issues_total": 1,
|
| 142 |
"noise_penalties": 5,
|
| 143 |
"terminated_reason": "noise_exhausted",
|
| 144 |
+
"duration_seconds": 0.03
|
| 145 |
},
|
| 146 |
{
|
| 147 |
+
"episode_id": "5d9d3f74-a127-46b2-a331-45f30f0d2f6f",
|
| 148 |
"task_id": "security_audit",
|
| 149 |
"seed": 2,
|
| 150 |
"final_score": 0.0,
|
|
|
|
| 156 |
"duration_seconds": 0.03
|
| 157 |
},
|
| 158 |
{
|
| 159 |
+
"episode_id": "f383afc4-df2d-4a60-abd1-a583cb2de538",
|
| 160 |
"task_id": "security_audit",
|
| 161 |
"seed": 3,
|
| 162 |
"final_score": 0.85,
|
|
|
|
| 165 |
"issues_total": 1,
|
| 166 |
"noise_penalties": 5,
|
| 167 |
"terminated_reason": "noise_exhausted",
|
| 168 |
+
"duration_seconds": 0.03
|
| 169 |
},
|
| 170 |
{
|
| 171 |
+
"episode_id": "df23e18b-c21a-4390-9afa-f77eb1b8050b",
|
| 172 |
"task_id": "security_audit",
|
| 173 |
"seed": 4,
|
| 174 |
"final_score": 0.0,
|
|
|
|
| 180 |
"duration_seconds": 0.03
|
| 181 |
},
|
| 182 |
{
|
| 183 |
+
"episode_id": "bcec83af-4aaf-4997-922a-1556ca63fcf3",
|
| 184 |
"task_id": "security_audit",
|
| 185 |
"seed": 5,
|
| 186 |
"final_score": 0.0,
|
|
|
|
| 189 |
"issues_total": 1,
|
| 190 |
"noise_penalties": 5,
|
| 191 |
"terminated_reason": "noise_exhausted",
|
| 192 |
+
"duration_seconds": 0.03
|
| 193 |
},
|
| 194 |
{
|
| 195 |
+
"episode_id": "65afedd0-9ad1-45f7-9615-dafeaa390338",
|
| 196 |
"task_id": "security_audit",
|
| 197 |
"seed": 6,
|
| 198 |
"final_score": 0.0,
|
|
|
|
| 204 |
"duration_seconds": 0.03
|
| 205 |
},
|
| 206 |
{
|
| 207 |
+
"episode_id": "0e8b65e5-d59c-41fb-8038-1f3b1d4b657b",
|
| 208 |
"task_id": "security_audit",
|
| 209 |
"seed": 7,
|
| 210 |
"final_score": 0.0,
|
|
|
|
| 216 |
"duration_seconds": 0.03
|
| 217 |
},
|
| 218 |
{
|
| 219 |
+
"episode_id": "ae477e21-2718-477e-8a11-e46bda49bea7",
|
| 220 |
"task_id": "security_audit",
|
| 221 |
"seed": 8,
|
| 222 |
"final_score": 0.0,
|
|
|
|
| 225 |
"issues_total": 1,
|
| 226 |
"noise_penalties": 5,
|
| 227 |
"terminated_reason": "noise_exhausted",
|
| 228 |
+
"duration_seconds": 0.04
|
| 229 |
},
|
| 230 |
{
|
| 231 |
+
"episode_id": "0cd3d18c-cdf6-4752-9dc8-cade848cee0e",
|
| 232 |
"task_id": "security_audit",
|
| 233 |
"seed": 9,
|
| 234 |
"final_score": 0.0,
|
|
|
|
| 240 |
"duration_seconds": 0.03
|
| 241 |
},
|
| 242 |
{
|
| 243 |
+
"episode_id": "d9f79316-57ea-42ad-b191-797ea895951b",
|
| 244 |
"task_id": "architectural_review",
|
| 245 |
"seed": 0,
|
| 246 |
"final_score": 0.0,
|
|
|
|
| 249 |
"issues_total": 1,
|
| 250 |
"noise_penalties": 0,
|
| 251 |
"terminated_reason": "terminal_action",
|
| 252 |
+
"duration_seconds": 0.01
|
| 253 |
},
|
| 254 |
{
|
| 255 |
+
"episode_id": "65112da3-d319-4eb3-9774-1c43313fe1ec",
|
| 256 |
"task_id": "architectural_review",
|
| 257 |
"seed": 1,
|
| 258 |
"final_score": 0.059,
|
|
|
|
| 261 |
"issues_total": 1,
|
| 262 |
"noise_penalties": 5,
|
| 263 |
"terminated_reason": "noise_exhausted",
|
| 264 |
+
"duration_seconds": 0.03
|
| 265 |
},
|
| 266 |
{
|
| 267 |
+
"episode_id": "c1d6bd63-1abf-403b-97be-cf1962959910",
|
| 268 |
"task_id": "architectural_review",
|
| 269 |
"seed": 2,
|
| 270 |
"final_score": 0.661,
|
|
|
|
| 273 |
"issues_total": 1,
|
| 274 |
"noise_penalties": 5,
|
| 275 |
"terminated_reason": "noise_exhausted",
|
| 276 |
+
"duration_seconds": 0.03
|
| 277 |
},
|
| 278 |
{
|
| 279 |
+
"episode_id": "fab56551-f2ae-42be-88eb-854ef236b29a",
|
| 280 |
"task_id": "architectural_review",
|
| 281 |
"seed": 3,
|
| 282 |
"final_score": 0.658,
|
|
|
|
| 288 |
"duration_seconds": 0.03
|
| 289 |
},
|
| 290 |
{
|
| 291 |
+
"episode_id": "8f4b8fa9-f281-4bb2-ae71-7ad6c7ca7fc9",
|
| 292 |
"task_id": "architectural_review",
|
| 293 |
"seed": 4,
|
| 294 |
"final_score": 0.058,
|
|
|
|
| 300 |
"duration_seconds": 0.03
|
| 301 |
},
|
| 302 |
{
|
| 303 |
+
"episode_id": "7eaa445d-774d-44b1-80d9-06e978b022df",
|
| 304 |
"task_id": "architectural_review",
|
| 305 |
"seed": 5,
|
| 306 |
"final_score": 0.657,
|
|
|
|
| 309 |
"issues_total": 1,
|
| 310 |
"noise_penalties": 5,
|
| 311 |
"terminated_reason": "noise_exhausted",
|
| 312 |
+
"duration_seconds": 0.03
|
| 313 |
},
|
| 314 |
{
|
| 315 |
+
"episode_id": "103d6e2b-e40d-49ff-8f36-6910d463b48b",
|
| 316 |
"task_id": "architectural_review",
|
| 317 |
"seed": 6,
|
| 318 |
"final_score": 0.059,
|
|
|
|
| 324 |
"duration_seconds": 0.03
|
| 325 |
},
|
| 326 |
{
|
| 327 |
+
"episode_id": "5206e9d8-0ee0-482c-ad3d-34adf2ce57a7",
|
| 328 |
"task_id": "architectural_review",
|
| 329 |
"seed": 7,
|
| 330 |
"final_score": 0.664,
|
|
|
|
| 333 |
"issues_total": 1,
|
| 334 |
"noise_penalties": 5,
|
| 335 |
"terminated_reason": "noise_exhausted",
|
| 336 |
+
"duration_seconds": 0.03
|
| 337 |
},
|
| 338 |
{
|
| 339 |
+
"episode_id": "c147abd7-f887-44a2-b166-5382e292d793",
|
| 340 |
"task_id": "architectural_review",
|
| 341 |
"seed": 8,
|
| 342 |
"final_score": 0.039,
|
|
|
|
| 345 |
"issues_total": 1,
|
| 346 |
"noise_penalties": 5,
|
| 347 |
"terminated_reason": "noise_exhausted",
|
| 348 |
+
"duration_seconds": 0.08
|
| 349 |
},
|
| 350 |
{
|
| 351 |
+
"episode_id": "6fe54153-a878-47d7-a521-5cd70f55f6b8",
|
| 352 |
"task_id": "architectural_review",
|
| 353 |
"seed": 9,
|
| 354 |
"final_score": 0.075,
|
|
|
|
| 357 |
"issues_total": 1,
|
| 358 |
"noise_penalties": 5,
|
| 359 |
"terminated_reason": "noise_exhausted",
|
| 360 |
+
"duration_seconds": 0.09
|
| 361 |
}
|
| 362 |
]
|
scripts/reset_db.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Reset the CodeLens database: deletes the SQLite file and re-initializes tables.
|
| 4 |
+
Useful for clearing test data and starting fresh evaluation benchmarks.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add project root to path
|
| 12 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 13 |
+
|
| 14 |
+
from codelens_env.config import get_settings
|
| 15 |
+
from codelens_env.database import create_db_and_tables
|
| 16 |
+
|
| 17 |
+
def reset_db():
    """Delete the local SQLite database file and re-create an empty schema.

    Reads ``db_path`` from the application settings, removes that file if it
    is present, then calls ``create_db_and_tables()`` to build fresh tables.

    This script only knows how to reset the local SQLite file. When
    ``settings.database_url`` is set, ``get_engine()`` connects to that URL
    instead of the SQLite file, so deleting the file would not reset the
    database the application actually uses. In that case the function refuses
    to run rather than report a false success.

    Exits the process with status 1 if an external database is configured,
    if the file cannot be removed, or if schema creation fails.
    """
    settings = get_settings()

    # Refuse to "reset" when the app is pointed at an external database.
    # getattr keeps this working against a Settings class without the field.
    # NOTE(review): assumes create_db_and_tables() builds its engine via
    # get_engine() -- confirm against codelens_env/database.py.
    if getattr(settings, "database_url", None):
        print(
            "DATABASE_URL is set, so the application is not using the local "
            "SQLite file; this script only resets SQLite. Aborting.",
            file=sys.stderr,
        )
        sys.exit(1)

    db_path = Path(settings.db_path)

    # 1. Delete existing database file.
    # Attempt the removal directly instead of checking exists() first, so a
    # file that disappears in between cannot cause a spurious failure.
    try:
        db_path.unlink()
    except FileNotFoundError:
        print(f"No existing database found at {db_path}")
    except OSError as e:
        print(f"Error deleting file: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"Removed existing database at: {db_path}")
        print("Successfully deleted old records.")

    # 2. Re-initialize
    print("Re-initializing schema...")
    try:
        create_db_and_tables()
    except Exception as e:
        # Top-level script boundary: report any failure and exit non-zero.
        print(f"Error re-initializing: {e}", file=sys.stderr)
        sys.exit(1)
    print("Database reset successfully. You now have a clean dashboard.")
|
| 41 |
+
|
| 42 |
+
if __name__ == "__main__":
    # Destructive operation: only an explicit "y"/"Y" answer proceeds;
    # anything else (including just pressing Enter) aborts.
    answer = input("This will permanently delete all leaderboard and episode data. Proceed? [y/N]: ")
    if answer.lower() != 'y':
        print("Reset aborted.")
    else:
        reset_db()
|
tests/conftest.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
import pytest
|
|
|
|
|
|
|
| 2 |
from fastapi.testclient import TestClient
|
| 3 |
from sqlmodel import SQLModel, Session, create_engine
|
| 4 |
from sqlmodel.pool import StaticPool
|
|
|
|
| 1 |
import pytest
|
| 2 |
+
import os
|
| 3 |
+
os.environ["TESTING"] = "true"
|
| 4 |
from fastapi.testclient import TestClient
|
| 5 |
from sqlmodel import SQLModel, Session, create_engine
|
| 6 |
from sqlmodel.pool import StaticPool
|
tests/test_api.py
CHANGED
|
@@ -3,15 +3,13 @@ from fastapi.testclient import TestClient
|
|
| 3 |
from app import app
|
| 4 |
from codelens_env.models import TaskId, ActionType, Category, Severity, Verdict
|
| 5 |
|
| 6 |
-
def test_api_health():
|
| 7 |
-
client = TestClient(app)
|
| 8 |
response = client.get("/health")
|
| 9 |
assert response.status_code == 200
|
| 10 |
assert response.json()["status"] == "ok"
|
| 11 |
assert response.json()["env_ready"] is True
|
| 12 |
|
| 13 |
-
def test_api_workflow():
|
| 14 |
-
client = TestClient(app)
|
| 15 |
|
| 16 |
# 1. Reset
|
| 17 |
reset_resp = client.post("/reset", json={"task_id": "bug_detection", "seed": 1})
|
|
@@ -34,8 +32,7 @@ def test_api_workflow():
|
|
| 34 |
assert result_resp.status_code == 200
|
| 35 |
assert result_resp.json()["final_score"] >= 0
|
| 36 |
|
| 37 |
-
def test_api_leaderboard():
|
| 38 |
-
client = TestClient(app)
|
| 39 |
# Submit a score
|
| 40 |
sub = {
|
| 41 |
"agent_name": "test_agent",
|
|
@@ -55,8 +52,7 @@ def test_api_leaderboard():
|
|
| 55 |
assert len(bug_entries) > 0
|
| 56 |
assert bug_entries[0]["agent_name"] == "test_agent"
|
| 57 |
|
| 58 |
-
def test_api_invalid_episode():
|
| 59 |
-
client = TestClient(app)
|
| 60 |
response = client.post("/step/nonexistent-id", json={
|
| 61 |
"action_type": "comment",
|
| 62 |
"body": "hello"
|
|
|
|
| 3 |
from app import app
|
| 4 |
from codelens_env.models import TaskId, ActionType, Category, Severity, Verdict
|
| 5 |
|
| 6 |
+
def test_api_health(client):
|
|
|
|
| 7 |
response = client.get("/health")
|
| 8 |
assert response.status_code == 200
|
| 9 |
assert response.json()["status"] == "ok"
|
| 10 |
assert response.json()["env_ready"] is True
|
| 11 |
|
| 12 |
+
def test_api_workflow(client):
|
|
|
|
| 13 |
|
| 14 |
# 1. Reset
|
| 15 |
reset_resp = client.post("/reset", json={"task_id": "bug_detection", "seed": 1})
|
|
|
|
| 32 |
assert result_resp.status_code == 200
|
| 33 |
assert result_resp.json()["final_score"] >= 0
|
| 34 |
|
| 35 |
+
def test_api_leaderboard(client):
|
|
|
|
| 36 |
# Submit a score
|
| 37 |
sub = {
|
| 38 |
"agent_name": "test_agent",
|
|
|
|
| 52 |
assert len(bug_entries) > 0
|
| 53 |
assert bug_entries[0]["agent_name"] == "test_agent"
|
| 54 |
|
| 55 |
+
def test_api_invalid_episode(client):
|
|
|
|
| 56 |
response = client.post("/step/nonexistent-id", json={
|
| 57 |
"action_type": "comment",
|
| 58 |
"body": "hello"
|