ArshVerma committed on
Commit
c90ac2d
·
1 Parent(s): c1972ef

Skip DB init in TESTING; add reset_db

Browse files

Avoid initializing the SQLite DB during tests by checking the TESTING env var in app.lifespan and logging accordingly. Add scripts/reset_db.py to remove and re-create the DB file for fresh state during development/benchmarks. Update tests: conftest sets TESTING=true and test_api uses a shared client fixture instead of creating TestClient per test. Update results.json sample data (IDs and durations) as part of test/sample data refresh.

DEPLOYMENT.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeLens. Deployment Guide (Production)
2
+
3
+ Follow this guide to deploy **CodeLens. v1.0.0** to a production cloud environment. This configuration uses **Vercel** for the frontend, **Render** for the backend, and **Supabase/Neon** for the PostgreSQL database.
4
+
5
+ ---
6
+
7
+ ## 1. 🗄️ Setup the Database (PostgreSQL)
8
+
9
+ SQLite stores its data in a local file, and the filesystem on Render/Vercel is ephemeral: that file is wiped on every restart or redeploy. For production you **must** use a managed PostgreSQL service instead.
10
+
11
+ 1. **Go to [Supabase](https://supabase.com)** or [Neon](https://neon.tech).
12
+ 2. **Create a new Project** called "CodeLens".
13
+ 3. **Copy your Connection String** (it should look like `postgres://user:pass@host:5432/dbname`).
14
+ 4. **Important**: Keep this URL safe—it is your `DATABASE_URL`.
15
+
16
+ ---
17
+
18
+ ## 2. 🚀 Setup the Backend (Render)
19
+
20
+ Render will host your FastAPI API and your Dockerized environment.
21
+
22
+ 1. **Go to [Render Dashboard](https://dashboard.render.com)**.
23
+ 2. **New -> Web Service** and connect your GitHub repository.
24
+ 3. **Configure**:
25
+ - **Runtime**: `Docker`.
26
+ - **Environment Variables**:
27
+ - `DATABASE_URL`: (Paste your Supabase/Neon URL here).
28
+ - `API_KEY_ENABLED`: `true` (highly recommended for production).
29
+ - `API_KEY`: A strong secret password.
30
+ - `APP_ENV`: `production`.
31
+ 4. **Deploy**: Render will automatically build the `Dockerfile` in the root and start the service.
32
+ 5. **Identify**: Copy your Render URL (e.g., `https://codelens-api.onrender.com`).
33
+
34
+ ---
35
+
36
+ ## 3. 🎨 Setup the Frontend (Vercel)
37
+
38
+ Vercel will host your React/Vite dashboard.
39
+
40
+ 1. **Go to [Vercel](https://vercel.com)**.
41
+ 2. **Import** your `dashboard` folder (or the whole repository and set the root directory to `dashboard`).
42
+ 3. **Update `vercel.json`**:
43
+ - Open [`dashboard/vercel.json`](dashboard/vercel.json).
44
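+ - Replace the `YOUR_BACKEND_URL.render.com` placeholder in **both** rewrite destinations (the `/api` and `/ws` rules) with your **real** Render host (e.g., `codelens-api.onrender.com`).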
45
+ 4. **Deploy**: Vercel will build the React application and provide a global dashboard link.
46
+
47
+ ---
48
+
49
+ ## 4. 🤖 Running Remote Evaluations
50
+
51
+ Once deployed, you can run the benchmark script from your local machine (or any CI) against your **production** instance:
52
+
53
+ ```bash
54
+ python scripts/evaluate.py --url https://your-render-url.com --api-key YOUR_SECRET_KEY
55
+ ```
56
+
57
+ ---
58
+
59
+ > [!CAUTION]
60
+ > **Database Migrations**: When you first deploy to a new PostgreSQL instance, the database will not contain any tables yet. They are created automatically at application startup, when the lifespan hook calls `create_db_and_tables()`—no manual SQL is required.
61
+
62
+ > [!TIP]
63
+ > **Vercel Rewrites**: The `vercel.json` rewrite rule is what allows the frontend to talk to the backend without CORS issues. Ensure the URL is exactly correct.
app.py CHANGED
@@ -42,9 +42,13 @@ logger = logging.getLogger("codelens_env")
42
  @asynccontextmanager
43
  async def lifespan(app: FastAPI):
44
  # Startup
45
- create_db_and_tables()
 
 
 
 
 
46
  cleanup_task = asyncio.create_task(cleanup_expired_episodes())
47
- logger.info(f"CodeLens API started — DB at {settings.db_path}")
48
 
49
  yield
50
 
 
42
  @asynccontextmanager
43
  async def lifespan(app: FastAPI):
44
  # Startup
45
+ if not os.getenv("TESTING"):
46
+ create_db_and_tables()
47
+ logger.info(f"CodeLens API started — DB at {settings.db_path}")
48
+ else:
49
+ logger.info("CodeLens API running in TESTING mode — DB initialization skipped")
50
+
51
  cleanup_task = asyncio.create_task(cleanup_expired_episodes())
 
52
 
53
  yield
54
 
codelens_env/config.py CHANGED
@@ -1,4 +1,5 @@
1
  from functools import lru_cache
 
2
  from pydantic_settings import BaseSettings, SettingsConfigDict
3
 
4
  class Settings(BaseSettings):
@@ -19,6 +20,7 @@ class Settings(BaseSettings):
19
  rate_limit_per_minute: int = 60 # requests per minute per IP
20
 
21
  # Persistence
 
22
  db_path: str = "./data/codelens.db"
23
  db_echo: bool = False # Set True to log all SQL queries
24
 
 
1
  from functools import lru_cache
2
+ from typing import Optional
3
  from pydantic_settings import BaseSettings, SettingsConfigDict
4
 
5
  class Settings(BaseSettings):
 
20
  rate_limit_per_minute: int = 60 # requests per minute per IP
21
 
22
  # Persistence
23
+ database_url: Optional[str] = None
24
  db_path: str = "./data/codelens.db"
25
  db_echo: bool = False # Set True to log all SQL queries
26
 
codelens_env/database.py CHANGED
@@ -7,6 +7,20 @@ from codelens_env.models import EpisodeResult, TaskId
7
 
8
  def get_engine():
9
  settings = get_settings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
11
  return create_engine(
12
  f"sqlite:///{settings.db_path}",
 
7
 
8
  def get_engine():
9
  settings = get_settings()
10
+
11
+ if settings.database_url:
12
+ # Support Render/Heroku 'postgres://' URLs by converting to 'postgresql://'
13
+ url = settings.database_url
14
+ if url.startswith("postgres://"):
15
+ url = url.replace("postgres://", "postgresql://", 1)
16
+
17
+ return create_engine(
18
+ url,
19
+ echo=settings.db_echo,
20
+ pool_pre_ping=True, # Ensure connections are alive
21
+ )
22
+
23
+ # Fallback to local SQLite
24
  Path(settings.db_path).parent.mkdir(parents=True, exist_ok=True)
25
  return create_engine(
26
  f"sqlite:///{settings.db_path}",
dashboard/vercel.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "rewrites": [
3
+ {
4
+ "source": "/api/(.*)",
5
+ "destination": "https://YOUR_BACKEND_URL.render.com/$1"
6
+ },
7
+ {
8
+ "source": "/ws/(.*)",
9
+ "destination": "wss://YOUR_BACKEND_URL.render.com/ws/$1"
10
+ }
11
+ ],
12
+ "framework": "vite",
13
+ "buildCommand": "npm run build",
14
+ "outputDirectory": "dist"
15
+ }
results.json CHANGED
@@ -1,6 +1,6 @@
1
  [
2
  {
3
- "episode_id": "0d8d0a22-2a81-48a1-8dbe-c9662da7dc2b",
4
  "task_id": "bug_detection",
5
  "seed": 0,
6
  "final_score": 0.0,
@@ -9,10 +9,10 @@
9
  "issues_total": 1,
10
  "noise_penalties": 0,
11
  "terminated_reason": "terminal_action",
12
- "duration_seconds": 0.03
13
  },
14
  {
15
- "episode_id": "6ddc5a56-86c1-43f2-8839-b6ad9eac2ad9",
16
  "task_id": "bug_detection",
17
  "seed": 1,
18
  "final_score": 0.0,
@@ -24,7 +24,7 @@
24
  "duration_seconds": 0.02
25
  },
26
  {
27
- "episode_id": "df00496e-0e19-4049-82d9-6a0f2a1e8f2a",
28
  "task_id": "bug_detection",
29
  "seed": 2,
30
  "final_score": 0.9167,
@@ -33,10 +33,10 @@
33
  "issues_total": 1,
34
  "noise_penalties": 5,
35
  "terminated_reason": "noise_exhausted",
36
- "duration_seconds": 0.06
37
  },
38
  {
39
- "episode_id": "26837a97-d3ad-4fd7-8e74-d0d713b9b137",
40
  "task_id": "bug_detection",
41
  "seed": 3,
42
  "final_score": 0.9167,
@@ -45,10 +45,10 @@
45
  "issues_total": 1,
46
  "noise_penalties": 5,
47
  "terminated_reason": "noise_exhausted",
48
- "duration_seconds": 0.04
49
  },
50
  {
51
- "episode_id": "41b3e01d-5498-42a1-a3f9-44bb87f542b6",
52
  "task_id": "bug_detection",
53
  "seed": 4,
54
  "final_score": 0.8267,
@@ -60,7 +60,7 @@
60
  "duration_seconds": 0.03
61
  },
62
  {
63
- "episode_id": "8331b8bf-f397-49be-bc07-62bb0bdddd5c",
64
  "task_id": "bug_detection",
65
  "seed": 5,
66
  "final_score": 0.0,
@@ -69,10 +69,10 @@
69
  "issues_total": 1,
70
  "noise_penalties": 0,
71
  "terminated_reason": "terminal_action",
72
- "duration_seconds": 0.02
73
  },
74
  {
75
- "episode_id": "22341ab4-ea27-416b-ba40-53f93b6090c9",
76
  "task_id": "bug_detection",
77
  "seed": 6,
78
  "final_score": 0.0,
@@ -84,7 +84,7 @@
84
  "duration_seconds": 0.02
85
  },
86
  {
87
- "episode_id": "9fc11809-6ba4-4bf6-b00e-9b8ffc61218f",
88
  "task_id": "bug_detection",
89
  "seed": 7,
90
  "final_score": 0.0,
@@ -96,7 +96,7 @@
96
  "duration_seconds": 0.02
97
  },
98
  {
99
- "episode_id": "dce54792-7fba-4e8c-8feb-404aa655172f",
100
  "task_id": "bug_detection",
101
  "seed": 8,
102
  "final_score": 0.9167,
@@ -105,10 +105,10 @@
105
  "issues_total": 1,
106
  "noise_penalties": 5,
107
  "terminated_reason": "noise_exhausted",
108
- "duration_seconds": 0.04
109
  },
110
  {
111
- "episode_id": "be559f48-b216-49b9-8f3d-ad9b50da3751",
112
  "task_id": "bug_detection",
113
  "seed": 9,
114
  "final_score": 0.0,
@@ -120,7 +120,7 @@
120
  "duration_seconds": 0.03
121
  },
122
  {
123
- "episode_id": "b2029cb1-c72e-438a-bd96-98a0aa9bc141",
124
  "task_id": "security_audit",
125
  "seed": 0,
126
  "final_score": 0.0,
@@ -132,7 +132,7 @@
132
  "duration_seconds": 0.03
133
  },
134
  {
135
- "episode_id": "e383f50e-5797-40d9-9f54-aa4142247a6e",
136
  "task_id": "security_audit",
137
  "seed": 1,
138
  "final_score": 1.0,
@@ -141,10 +141,10 @@
141
  "issues_total": 1,
142
  "noise_penalties": 5,
143
  "terminated_reason": "noise_exhausted",
144
- "duration_seconds": 0.04
145
  },
146
  {
147
- "episode_id": "6560439d-9050-4c94-96b1-c31111081c2e",
148
  "task_id": "security_audit",
149
  "seed": 2,
150
  "final_score": 0.0,
@@ -156,7 +156,7 @@
156
  "duration_seconds": 0.03
157
  },
158
  {
159
- "episode_id": "d9a9d12d-384c-4dd2-86cb-b55ea7604e60",
160
  "task_id": "security_audit",
161
  "seed": 3,
162
  "final_score": 0.85,
@@ -165,10 +165,10 @@
165
  "issues_total": 1,
166
  "noise_penalties": 5,
167
  "terminated_reason": "noise_exhausted",
168
- "duration_seconds": 0.04
169
  },
170
  {
171
- "episode_id": "463b8d96-ad63-472f-ad93-f20c99afe96e",
172
  "task_id": "security_audit",
173
  "seed": 4,
174
  "final_score": 0.0,
@@ -180,7 +180,7 @@
180
  "duration_seconds": 0.03
181
  },
182
  {
183
- "episode_id": "15e9c92a-559a-4ba6-b75b-b12ac6055f91",
184
  "task_id": "security_audit",
185
  "seed": 5,
186
  "final_score": 0.0,
@@ -189,10 +189,10 @@
189
  "issues_total": 1,
190
  "noise_penalties": 5,
191
  "terminated_reason": "noise_exhausted",
192
- "duration_seconds": 0.06
193
  },
194
  {
195
- "episode_id": "38bd8513-7511-4a2f-8642-fc23f50b61bd",
196
  "task_id": "security_audit",
197
  "seed": 6,
198
  "final_score": 0.0,
@@ -204,7 +204,7 @@
204
  "duration_seconds": 0.03
205
  },
206
  {
207
- "episode_id": "8a6e48fd-1b17-475f-b495-a1cd8ce2e70c",
208
  "task_id": "security_audit",
209
  "seed": 7,
210
  "final_score": 0.0,
@@ -216,7 +216,7 @@
216
  "duration_seconds": 0.03
217
  },
218
  {
219
- "episode_id": "3ec40833-ce55-4a63-ad90-55b2e35370d1",
220
  "task_id": "security_audit",
221
  "seed": 8,
222
  "final_score": 0.0,
@@ -225,10 +225,10 @@
225
  "issues_total": 1,
226
  "noise_penalties": 5,
227
  "terminated_reason": "noise_exhausted",
228
- "duration_seconds": 0.03
229
  },
230
  {
231
- "episode_id": "87fa57a3-27f3-4f9b-a0ad-008ff9b4c7f2",
232
  "task_id": "security_audit",
233
  "seed": 9,
234
  "final_score": 0.0,
@@ -240,7 +240,7 @@
240
  "duration_seconds": 0.03
241
  },
242
  {
243
- "episode_id": "8a878e69-51c2-4992-8276-616cbc798efd",
244
  "task_id": "architectural_review",
245
  "seed": 0,
246
  "final_score": 0.0,
@@ -249,10 +249,10 @@
249
  "issues_total": 1,
250
  "noise_penalties": 0,
251
  "terminated_reason": "terminal_action",
252
- "duration_seconds": 0.02
253
  },
254
  {
255
- "episode_id": "c85766aa-d231-4998-8d95-b4dbbfd5aea6",
256
  "task_id": "architectural_review",
257
  "seed": 1,
258
  "final_score": 0.059,
@@ -261,10 +261,10 @@
261
  "issues_total": 1,
262
  "noise_penalties": 5,
263
  "terminated_reason": "noise_exhausted",
264
- "duration_seconds": 0.04
265
  },
266
  {
267
- "episode_id": "29122fdb-ed30-4a46-8b04-4aa8c2025e6c",
268
  "task_id": "architectural_review",
269
  "seed": 2,
270
  "final_score": 0.661,
@@ -273,10 +273,10 @@
273
  "issues_total": 1,
274
  "noise_penalties": 5,
275
  "terminated_reason": "noise_exhausted",
276
- "duration_seconds": 0.04
277
  },
278
  {
279
- "episode_id": "1388c816-8e96-4014-b255-81382e9353ba",
280
  "task_id": "architectural_review",
281
  "seed": 3,
282
  "final_score": 0.658,
@@ -288,7 +288,7 @@
288
  "duration_seconds": 0.03
289
  },
290
  {
291
- "episode_id": "adee24d2-cb77-48b6-b00b-72c39d7d7c7e",
292
  "task_id": "architectural_review",
293
  "seed": 4,
294
  "final_score": 0.058,
@@ -300,7 +300,7 @@
300
  "duration_seconds": 0.03
301
  },
302
  {
303
- "episode_id": "12508a15-eb46-4628-b750-cde1b5f91828",
304
  "task_id": "architectural_review",
305
  "seed": 5,
306
  "final_score": 0.657,
@@ -309,10 +309,10 @@
309
  "issues_total": 1,
310
  "noise_penalties": 5,
311
  "terminated_reason": "noise_exhausted",
312
- "duration_seconds": 0.04
313
  },
314
  {
315
- "episode_id": "520fb8ad-58fe-48fa-8a96-89e552e12924",
316
  "task_id": "architectural_review",
317
  "seed": 6,
318
  "final_score": 0.059,
@@ -324,7 +324,7 @@
324
  "duration_seconds": 0.03
325
  },
326
  {
327
- "episode_id": "86982fa4-cee7-48fc-b8d1-347a2f650f6d",
328
  "task_id": "architectural_review",
329
  "seed": 7,
330
  "final_score": 0.664,
@@ -333,10 +333,10 @@
333
  "issues_total": 1,
334
  "noise_penalties": 5,
335
  "terminated_reason": "noise_exhausted",
336
- "duration_seconds": 0.04
337
  },
338
  {
339
- "episode_id": "086e541a-776e-4ac6-b44e-1eeb76d88b4e",
340
  "task_id": "architectural_review",
341
  "seed": 8,
342
  "final_score": 0.039,
@@ -345,10 +345,10 @@
345
  "issues_total": 1,
346
  "noise_penalties": 5,
347
  "terminated_reason": "noise_exhausted",
348
- "duration_seconds": 0.03
349
  },
350
  {
351
- "episode_id": "3bcd91ce-0d0d-48c5-ab25-bc8534906f5b",
352
  "task_id": "architectural_review",
353
  "seed": 9,
354
  "final_score": 0.075,
@@ -357,6 +357,6 @@
357
  "issues_total": 1,
358
  "noise_penalties": 5,
359
  "terminated_reason": "noise_exhausted",
360
- "duration_seconds": 0.03
361
  }
362
  ]
 
1
  [
2
  {
3
+ "episode_id": "27c65a00-9315-4d7d-b457-f8d64b0da466",
4
  "task_id": "bug_detection",
5
  "seed": 0,
6
  "final_score": 0.0,
 
9
  "issues_total": 1,
10
  "noise_penalties": 0,
11
  "terminated_reason": "terminal_action",
12
+ "duration_seconds": 0.02
13
  },
14
  {
15
+ "episode_id": "8bdcee56-8a56-4ec8-88fc-a70e78ab48f2",
16
  "task_id": "bug_detection",
17
  "seed": 1,
18
  "final_score": 0.0,
 
24
  "duration_seconds": 0.02
25
  },
26
  {
27
+ "episode_id": "d3dcda88-ce7e-4965-9409-4e97c98cf444",
28
  "task_id": "bug_detection",
29
  "seed": 2,
30
  "final_score": 0.9167,
 
33
  "issues_total": 1,
34
  "noise_penalties": 5,
35
  "terminated_reason": "noise_exhausted",
36
+ "duration_seconds": 0.04
37
  },
38
  {
39
+ "episode_id": "822dfb7d-d67b-4dc9-9cfe-866665a9e5b9",
40
  "task_id": "bug_detection",
41
  "seed": 3,
42
  "final_score": 0.9167,
 
45
  "issues_total": 1,
46
  "noise_penalties": 5,
47
  "terminated_reason": "noise_exhausted",
48
+ "duration_seconds": 0.03
49
  },
50
  {
51
+ "episode_id": "c35c2096-b7b8-4a54-b0c7-d8fa1e1538bf",
52
  "task_id": "bug_detection",
53
  "seed": 4,
54
  "final_score": 0.8267,
 
60
  "duration_seconds": 0.03
61
  },
62
  {
63
+ "episode_id": "9b4e5191-57da-4b10-b31e-5b8236b9b1f4",
64
  "task_id": "bug_detection",
65
  "seed": 5,
66
  "final_score": 0.0,
 
69
  "issues_total": 1,
70
  "noise_penalties": 0,
71
  "terminated_reason": "terminal_action",
72
+ "duration_seconds": 0.01
73
  },
74
  {
75
+ "episode_id": "c1e64a16-512a-4716-b0f9-5d9fe14a142d",
76
  "task_id": "bug_detection",
77
  "seed": 6,
78
  "final_score": 0.0,
 
84
  "duration_seconds": 0.02
85
  },
86
  {
87
+ "episode_id": "cc8bd066-506e-46de-ba0f-0a446f045945",
88
  "task_id": "bug_detection",
89
  "seed": 7,
90
  "final_score": 0.0,
 
96
  "duration_seconds": 0.02
97
  },
98
  {
99
+ "episode_id": "20dd7158-a49f-4b6c-adcd-6b76b4274e94",
100
  "task_id": "bug_detection",
101
  "seed": 8,
102
  "final_score": 0.9167,
 
105
  "issues_total": 1,
106
  "noise_penalties": 5,
107
  "terminated_reason": "noise_exhausted",
108
+ "duration_seconds": 0.03
109
  },
110
  {
111
+ "episode_id": "14b9d1a5-03b7-45b2-8b23-b6c387bced5b",
112
  "task_id": "bug_detection",
113
  "seed": 9,
114
  "final_score": 0.0,
 
120
  "duration_seconds": 0.03
121
  },
122
  {
123
+ "episode_id": "2b1c4b7f-cf05-4312-89d2-49883b9ed6c1",
124
  "task_id": "security_audit",
125
  "seed": 0,
126
  "final_score": 0.0,
 
132
  "duration_seconds": 0.03
133
  },
134
  {
135
+ "episode_id": "47145114-7a99-4c81-baa9-1f9c4221f48a",
136
  "task_id": "security_audit",
137
  "seed": 1,
138
  "final_score": 1.0,
 
141
  "issues_total": 1,
142
  "noise_penalties": 5,
143
  "terminated_reason": "noise_exhausted",
144
+ "duration_seconds": 0.03
145
  },
146
  {
147
+ "episode_id": "5d9d3f74-a127-46b2-a331-45f30f0d2f6f",
148
  "task_id": "security_audit",
149
  "seed": 2,
150
  "final_score": 0.0,
 
156
  "duration_seconds": 0.03
157
  },
158
  {
159
+ "episode_id": "f383afc4-df2d-4a60-abd1-a583cb2de538",
160
  "task_id": "security_audit",
161
  "seed": 3,
162
  "final_score": 0.85,
 
165
  "issues_total": 1,
166
  "noise_penalties": 5,
167
  "terminated_reason": "noise_exhausted",
168
+ "duration_seconds": 0.03
169
  },
170
  {
171
+ "episode_id": "df23e18b-c21a-4390-9afa-f77eb1b8050b",
172
  "task_id": "security_audit",
173
  "seed": 4,
174
  "final_score": 0.0,
 
180
  "duration_seconds": 0.03
181
  },
182
  {
183
+ "episode_id": "bcec83af-4aaf-4997-922a-1556ca63fcf3",
184
  "task_id": "security_audit",
185
  "seed": 5,
186
  "final_score": 0.0,
 
189
  "issues_total": 1,
190
  "noise_penalties": 5,
191
  "terminated_reason": "noise_exhausted",
192
+ "duration_seconds": 0.03
193
  },
194
  {
195
+ "episode_id": "65afedd0-9ad1-45f7-9615-dafeaa390338",
196
  "task_id": "security_audit",
197
  "seed": 6,
198
  "final_score": 0.0,
 
204
  "duration_seconds": 0.03
205
  },
206
  {
207
+ "episode_id": "0e8b65e5-d59c-41fb-8038-1f3b1d4b657b",
208
  "task_id": "security_audit",
209
  "seed": 7,
210
  "final_score": 0.0,
 
216
  "duration_seconds": 0.03
217
  },
218
  {
219
+ "episode_id": "ae477e21-2718-477e-8a11-e46bda49bea7",
220
  "task_id": "security_audit",
221
  "seed": 8,
222
  "final_score": 0.0,
 
225
  "issues_total": 1,
226
  "noise_penalties": 5,
227
  "terminated_reason": "noise_exhausted",
228
+ "duration_seconds": 0.04
229
  },
230
  {
231
+ "episode_id": "0cd3d18c-cdf6-4752-9dc8-cade848cee0e",
232
  "task_id": "security_audit",
233
  "seed": 9,
234
  "final_score": 0.0,
 
240
  "duration_seconds": 0.03
241
  },
242
  {
243
+ "episode_id": "d9f79316-57ea-42ad-b191-797ea895951b",
244
  "task_id": "architectural_review",
245
  "seed": 0,
246
  "final_score": 0.0,
 
249
  "issues_total": 1,
250
  "noise_penalties": 0,
251
  "terminated_reason": "terminal_action",
252
+ "duration_seconds": 0.01
253
  },
254
  {
255
+ "episode_id": "65112da3-d319-4eb3-9774-1c43313fe1ec",
256
  "task_id": "architectural_review",
257
  "seed": 1,
258
  "final_score": 0.059,
 
261
  "issues_total": 1,
262
  "noise_penalties": 5,
263
  "terminated_reason": "noise_exhausted",
264
+ "duration_seconds": 0.03
265
  },
266
  {
267
+ "episode_id": "c1d6bd63-1abf-403b-97be-cf1962959910",
268
  "task_id": "architectural_review",
269
  "seed": 2,
270
  "final_score": 0.661,
 
273
  "issues_total": 1,
274
  "noise_penalties": 5,
275
  "terminated_reason": "noise_exhausted",
276
+ "duration_seconds": 0.03
277
  },
278
  {
279
+ "episode_id": "fab56551-f2ae-42be-88eb-854ef236b29a",
280
  "task_id": "architectural_review",
281
  "seed": 3,
282
  "final_score": 0.658,
 
288
  "duration_seconds": 0.03
289
  },
290
  {
291
+ "episode_id": "8f4b8fa9-f281-4bb2-ae71-7ad6c7ca7fc9",
292
  "task_id": "architectural_review",
293
  "seed": 4,
294
  "final_score": 0.058,
 
300
  "duration_seconds": 0.03
301
  },
302
  {
303
+ "episode_id": "7eaa445d-774d-44b1-80d9-06e978b022df",
304
  "task_id": "architectural_review",
305
  "seed": 5,
306
  "final_score": 0.657,
 
309
  "issues_total": 1,
310
  "noise_penalties": 5,
311
  "terminated_reason": "noise_exhausted",
312
+ "duration_seconds": 0.03
313
  },
314
  {
315
+ "episode_id": "103d6e2b-e40d-49ff-8f36-6910d463b48b",
316
  "task_id": "architectural_review",
317
  "seed": 6,
318
  "final_score": 0.059,
 
324
  "duration_seconds": 0.03
325
  },
326
  {
327
+ "episode_id": "5206e9d8-0ee0-482c-ad3d-34adf2ce57a7",
328
  "task_id": "architectural_review",
329
  "seed": 7,
330
  "final_score": 0.664,
 
333
  "issues_total": 1,
334
  "noise_penalties": 5,
335
  "terminated_reason": "noise_exhausted",
336
+ "duration_seconds": 0.03
337
  },
338
  {
339
+ "episode_id": "c147abd7-f887-44a2-b166-5382e292d793",
340
  "task_id": "architectural_review",
341
  "seed": 8,
342
  "final_score": 0.039,
 
345
  "issues_total": 1,
346
  "noise_penalties": 5,
347
  "terminated_reason": "noise_exhausted",
348
+ "duration_seconds": 0.08
349
  },
350
  {
351
+ "episode_id": "6fe54153-a878-47d7-a521-5cd70f55f6b8",
352
  "task_id": "architectural_review",
353
  "seed": 9,
354
  "final_score": 0.075,
 
357
  "issues_total": 1,
358
  "noise_penalties": 5,
359
  "terminated_reason": "noise_exhausted",
360
+ "duration_seconds": 0.09
361
  }
362
  ]
scripts/reset_db.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Reset the CodeLens database: deletes the SQLite file and re-initializes tables.
4
+ Useful for clearing test data and starting fresh evaluation benchmarks.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Add project root to path
12
+ sys.path.insert(0, str(Path(__file__).parent.parent))
13
+
14
+ from codelens_env.config import get_settings
15
+ from codelens_env.database import create_db_and_tables
16
+
17
def reset_db():
    """Delete the local SQLite database file and re-create the schema.

    The database location is read from ``get_settings().db_path``. The file
    (and any SQLite sidecar files next to it) is removed, then
    ``create_db_and_tables()`` is called to build a fresh, empty schema.

    This script can only reset the file-based SQLite database. When
    ``database_url`` is configured, the engine is built from that URL rather
    than from ``db_path``, so deleting the local file would not reset anything
    and the schema step would run against the external database. In that case
    the reset is refused instead of reporting a success that did not happen.

    Exits the process with status 1 when the reset is refused, when a file
    cannot be deleted, or when schema creation fails.
    """
    settings = get_settings()

    # getattr keeps this working with settings objects that predate the
    # optional ``database_url`` field.
    if getattr(settings, "database_url", None):
        print(
            "DATABASE_URL is set, so the application is not using the local "
            "SQLite file. Refusing to reset; clear the external database "
            "manually or unset DATABASE_URL.",
            file=sys.stderr,
        )
        sys.exit(1)

    db_path = Path(settings.db_path)

    # SQLite may leave rollback-journal / WAL files beside the main file.
    # Remove them too so a stale journal is never paired with the new database.
    sidecars = [Path(f"{db_path}{suffix}") for suffix in ("-journal", "-wal", "-shm")]

    # 1. Delete existing database file
    existed = db_path.exists()
    if existed:
        print(f"Removing existing database at: {db_path}")
    else:
        print(f"No existing database found at {db_path}")

    try:
        for target in (db_path, *sidecars):
            target.unlink(missing_ok=True)
    except OSError as e:
        print(f"Error deleting file: {e}", file=sys.stderr)
        sys.exit(1)

    if existed:
        print("Successfully deleted old records.")

    # 2. Re-initialize
    print("Re-initializing schema...")
    try:
        create_db_and_tables()
    except Exception as e:  # script boundary: report any DB-layer failure and exit
        print(f"Error re-initializing: {e}", file=sys.stderr)
        sys.exit(1)

    print("Database reset successfully. You now have a clean dashboard.")
41
+
42
if __name__ == "__main__":
    # Destructive operation: require an explicit confirmation. Anything other
    # than "y"/"yes" (case-insensitive, surrounding whitespace ignored),
    # including an empty answer, aborts.
    confirm = input("This will permanently delete all leaderboard and episode data. Proceed? [y/N]: ")
    if confirm.strip().lower() in ("y", "yes"):
        reset_db()
    else:
        print("Reset aborted.")
tests/conftest.py CHANGED
@@ -1,4 +1,6 @@
1
  import pytest
 
 
2
  from fastapi.testclient import TestClient
3
  from sqlmodel import SQLModel, Session, create_engine
4
  from sqlmodel.pool import StaticPool
 
1
  import pytest
2
+ import os
3
+ os.environ["TESTING"] = "true"
4
  from fastapi.testclient import TestClient
5
  from sqlmodel import SQLModel, Session, create_engine
6
  from sqlmodel.pool import StaticPool
tests/test_api.py CHANGED
@@ -3,15 +3,13 @@ from fastapi.testclient import TestClient
3
  from app import app
4
  from codelens_env.models import TaskId, ActionType, Category, Severity, Verdict
5
 
6
- def test_api_health():
7
- client = TestClient(app)
8
  response = client.get("/health")
9
  assert response.status_code == 200
10
  assert response.json()["status"] == "ok"
11
  assert response.json()["env_ready"] is True
12
 
13
- def test_api_workflow():
14
- client = TestClient(app)
15
 
16
  # 1. Reset
17
  reset_resp = client.post("/reset", json={"task_id": "bug_detection", "seed": 1})
@@ -34,8 +32,7 @@ def test_api_workflow():
34
  assert result_resp.status_code == 200
35
  assert result_resp.json()["final_score"] >= 0
36
 
37
- def test_api_leaderboard():
38
- client = TestClient(app)
39
  # Submit a score
40
  sub = {
41
  "agent_name": "test_agent",
@@ -55,8 +52,7 @@ def test_api_leaderboard():
55
  assert len(bug_entries) > 0
56
  assert bug_entries[0]["agent_name"] == "test_agent"
57
 
58
- def test_api_invalid_episode():
59
- client = TestClient(app)
60
  response = client.post("/step/nonexistent-id", json={
61
  "action_type": "comment",
62
  "body": "hello"
 
3
  from app import app
4
  from codelens_env.models import TaskId, ActionType, Category, Severity, Verdict
5
 
6
+ def test_api_health(client):
 
7
  response = client.get("/health")
8
  assert response.status_code == 200
9
  assert response.json()["status"] == "ok"
10
  assert response.json()["env_ready"] is True
11
 
12
+ def test_api_workflow(client):
 
13
 
14
  # 1. Reset
15
  reset_resp = client.post("/reset", json={"task_id": "bug_detection", "seed": 1})
 
32
  assert result_resp.status_code == 200
33
  assert result_resp.json()["final_score"] >= 0
34
 
35
+ def test_api_leaderboard(client):
 
36
  # Submit a score
37
  sub = {
38
  "agent_name": "test_agent",
 
52
  assert len(bug_entries) > 0
53
  assert bug_entries[0]["agent_name"] == "test_agent"
54
 
55
+ def test_api_invalid_episode(client):
 
56
  response = client.post("/step/nonexistent-id", json={
57
  "action_type": "comment",
58
  "body": "hello"