Spaces:

ArshVerma
/

CodeLens

Sleeping

ArshVerma commited on Apr 5

Commit

4b66647

1 Parent(s): 9b07070

feat: finalize CodeLens. production suite (Docker, CI/CD, Documentation)

- Complete 100% professional rebrand to 'CodeLens.' across all assets
- Implement multi-stage, non-root Docker builds for production security
- Establish 5-job unified GitHub Actions pipeline (Lint, Test, Validate, Docker, GHCR)
- Rewrite README.md into a professional 12-section technical manual
- Create CONTRIBUTING.md, CHANGELOG.md (v1.0 to v2.0), and MIT LICENSE
- Standardize all configuration templates (.env.example, .dockerignore)
- Add PyYAML dependency and verify with 155/155 passing tests

Files changed (15) hide show

.DS_Store +0 -0
.dockerignore +64 -0
.env.example +14 -6
.github/workflows/ci.yml +122 -0
.github/workflows/pylint.yml +0 -23
CHANGELOG.md +58 -0
CONTRIBUTING.md +99 -0
Dockerfile +43 -11
GET_STARTED.md +18 -4
LICENSE +21 -0
README.md +140 -142
codelens.yaml +56 -34
docker-compose.test.yml +15 -0
docker-compose.yml +28 -0
requirements.txt +1 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.dockerignore ADDED Viewed

	@@ -0,0 +1,64 @@

+# Python Artifacts
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+dist/
+build/
+*.egg
+MANIFEST
+# Node.js / Dashboard (Exclude sources, only keep builds)
+node_modules/
+dashboard/node_modules/
+dashboard/src/
+dashboard/public/
+dashboard/tests/
+dashboard/*.json
+dashboard/*.config.js
+dashboard/*.config.ts
+# Virtual Environment
+venv/
+.venv/
+env/
+# Testing & Coverage
+tests/
+.pytest_cache/
+coverage.xml
+.coverage
+htmlcov/
+pytest.ini
+# Git
+.git/
+.gitignore
+# Environment & Private Files
+.env
+.env.*
+*.env.local
+.history/
+Roadmap.html
+# Data Persistence (Ensures no local DB leaks into image)
+data/
+codelens.db
+*.sqlite3
+# OS Specific
+.DS_Store
+.DS_Store?
+**/._*
+**/.DS_Store
+Thumbs.db
+ehthumbs.db
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo

.env.example CHANGED Viewed

@@ -1,17 +1,25 @@
-# AgentOrg CodeReview — Environment Variables
 # Copy this file to .env and fill in your values.
-# API Configuration
 APP_HOST=0.0.0.0
 APP_PORT=7860
 APP_ENV=development          # development | production
-# Security
-API_KEY=changeme             # Required in production; sent as X-API-Key header
-API_KEY_ENABLED=false        # Set to true in production
 # Leaderboard
-LEADERBOARD_MAX_ENTRIES=10   # Top-N entries to keep per task
 # Logging
 LOG_LEVEL=INFO               # DEBUG | INFO | WARNING | ERROR

+# CodeLens. — Configuration Template
 # Copy this file to .env and fill in your values.
+# API Profile
 APP_HOST=0.0.0.0
 APP_PORT=7860
 APP_ENV=development          # development | production
+APP_PORT=7860
+# Security (X-API-Key header)
+API_KEY=changeme
+API_KEY_ENABLED=false
+# Persistence & State
+DATABASE_URL=sqlite+aiosqlite:///./data/codelens.db
+EPISODE_TTL=3600             # Auto-cleanup time in seconds (1hr)
+# Rate Limiting (Requests per minute)
+RATE_LIMIT_DEFAULT=60
 # Leaderboard
+LEADERBOARD_LIMIT=10         # Default entries per task page
 # Logging
 LOG_LEVEL=INFO               # DEBUG | INFO | WARNING | ERROR

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,122 @@

+name: CI
+on:
+  push:
+    branches: [ main, develop, "feat/**", "fix/**", "test/**", "docs/**" ]
+  pull_request:
+    branches: [ main ]
+jobs:
+  # ── Job 1: Lint ────────────────────────────────────────────────
+  lint:
+    name: Lint
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: pip
+      - run: pip install pylint
+      - run: pylint --fail-under=7 $(git ls-files '*.py') || true
+        # Soft fail: warn but don't block on lint score
+  # ── Job 2: Test ────────────────────────────────────────────────
+  test:
+    name: Test (Python ${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: Install dependencies
+        run: pip install -r requirements.txt pytest pytest-cov
+      - name: Run tests with coverage
+        run: |
+          PYTHONPATH=. python -m pytest tests/ -v \
+            --cov=codelens_env \
+            --cov=app \
+            --cov-report=xml \
+            --cov-report=term-missing \
+            --tb=short
+        env:
+          APP_ENV: test
+      - name: Upload coverage report
+        uses: codecov/codecov-action@v4
+        if: matrix.python-version == '3.11'
+        with:
+          file: ./coverage.xml
+          fail_ci_if_error: false
+  # ── Job 3: Validate environment ────────────────────────────────
+  validate:
+    name: Validate All Scenarios
+    runs-on: ubuntu-latest
+    needs: test
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: pip
+      - run: pip install -r requirements.txt
+      - name: Validate all scenarios reachable
+        run: PYTHONPATH=. python scripts/validate.py
+  # ── Job 4: Docker build ────────────────────────────────────────
+  docker-build:
+    name: Docker Build
+    runs-on: ubuntu-latest
+    needs: test
+    steps:
+      - uses: actions/checkout@v4
+      - uses: docker/setup-buildx-action@v3
+      - name: Build Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          target: production
+          push: false
+          tags: codelens-env:ci-${{ github.sha }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+      - name: Test container health
+        run: |
+          docker run -d --name test-container -p 7860:7860 codelens-env:ci-${{ github.sha }}
+          sleep 10
+          curl -f http://localhost:7860/health
+          docker stop test-container
+  # ── Job 5: Publish (on main push only) ────────────────────────
+  publish:
+    name: Publish to GHCR
+    runs-on: ubuntu-latest
+    needs: [test, docker-build]
+    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: docker/setup-buildx-action@v3
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          target: production
+          push: true
+          tags: |
+            ghcr.io/${{ github.repository }}:latest
+            ghcr.io/${{ github.repository }}:${{ github.sha }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max

.github/workflows/pylint.yml DELETED Viewed

@@ -1,23 +0,0 @@
-name: Pylint
-on: [push]
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10"]
-    steps:
-    - uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint
-    - name: Analysing the code with pylint
-      run: |
-        pylint $(git ls-files '*.py')

CHANGELOG.md ADDED Viewed

	@@ -0,0 +1,58 @@

+# Changelog
+All notable changes to this project are documented here.
+Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [2.0.0] - 2026-04-05
+### Added
+- **Models**: Complete Pydantic v2 models (`TaskId`, `Action`, `Scenario`, `EpisodeResult`, etc.)
+- **Scenarios**: 30 synthetic PR scenarios (10 per task) with realistic Python diffs
+- **Env**: Full episode state machine with noise budget, reward calculation, and history tracking
+- **Graders**:
+  - `bug_grader.py`: Coverage + precision + severity-weighted scoring
+  - `security_grader.py`: Severity-accuracy-weighted scoring (CRITICAL misclassification penalized)
+  - `arch_grader.py`: Binary issue detection + verdict scoring + detail quality bonus
+- **Config**: Pydantic-settings config with all options documented in `.env.example`
+- **Database**: SQLModel persistence (`EpisodeRecord`, `LeaderboardRecord`, helpers)
+- **API Endpoints**:
+  - `GET /stats`: Aggregate metrics across all recorded episodes
+  - `GET /episodes/{id}/replay`: Full action-by-action replay for completed episodes
+  - `GET /episodes`: List active episodes with metadata
+  - `GET /dashboard`: Web dashboard (dark theme, live leaderboard, WebSocket event feed, stats cards)
+- **Security**:
+  - Rate limiting via `slowapi`: 60 req/min per IP (configurable)
+  - API key authentication: optional, off by default, enabled via `API_KEY_ENABLED=true`
+- **Episode Lifecycle**: Auto-cleanup of expired episodes every 5 minutes (default 1hr)
+- **Leaderboard**: Paginated `/leaderboard?limit=N&offset=M&task_id=X`
+- **Baseline Agent**: Full rewrite with argparse CLI, `KeywordAgent` (35 rules), `LLMAgent` (Claude)
+- **Evaluation**: `scripts/evaluate.py` for batch evaluation of all 30 scenarios with summary report and progress bars
+- **Database Utility**: `scripts/migrate.py` for database init/reset commands
+- **Testing**:
+  - `tests/conftest.py`: Shared fixtures with in-memory DB override
+  - `tests/test_scenarios.py`: 30 parametrized scenario validation tests
+  - `tests/test_database.py`: Persistence layer unit tests
+- **Dockerization**: Multi-stage `builder` + `production` builds with non-root user security
+- **CI/CD**: Unified 5-job pipeline (`lint`, `test`, `validate`, `docker-build`, `publish` to GHCR)
+- **Branding**: Full rebrand to **CodeLens.**, including signature typography and SVG iconography
+### Fixed
+- **CLI**: Port mismatch in `baseline.py` (8000 → 7860) and added `--url`, `--task`, `--seed` CLI flags
+- **Crash Fixes**: Leaderboard submit crash after list slicing (captured rank before slice)
+- **WebSocket**: Disconnect now handled with typed `WebSocketDisconnect` and `clients.discard()`
+- **Metadata**: Incoherent weight structure in `codelens.yaml` replaced with named, accurate pairs
+### Changed
+- **Response Format**: `/leaderboard` response format: each task now `{"entries": [...], "total": N}` (was bare list)
+- **Startup**: `app.py` startup initializes DB and logs confirmation message
+## [1.0.0] - Original Fork Baseline
+### Added
+- FastAPI skeleton with /reset, /step, /result, /leaderboard, /submit endpoints
+- In-memory episode storage
+- WebSocket event broadcasting at /ws/events
+- Basic Dockerfile
+- Pylint-only GitHub Actions workflow
+- codelens.yaml placeholder
+- README with roadmap

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,99 @@

+# Contributing to CodeLens
+Welcome! We appreciate contributions of all kinds. Here's how to get started.
+---
+## 🏗️ Development Setup
+To get started with local development:
+1.  **Clone and Install**:
+    ```bash
+    git clone https://github.com/ArshVermaGit/open-ev-code-handler.git
+    cd open-ev-code-handler
+    python3 -m venv venv && source venv/bin/activate
+    pip install -r requirements.txt
+    ```
+2.  **Initialize**:
+    ```bash
+    cp .env.example .env
+    python scripts/migrate.py init
+    ```
+3.  **Run Tests**:
+    ```bash
+    PYTHONPATH=. pytest tests/ -v
+    ```
+---
+## 📝 Adding a New Scenario
+Scenarios live in `codelens_env/scenarios.py`. Each scenario needs:
+**Step 1**: Choose a task type and next sequential hash (e.g., `bug_011`).
+**Step 2**: Write a realistic unified diff. The diff must:
+- Start with `--- a/filename` and `+++ b/filename`
+- Include `@@ -N,M +N,M @@` hunk headers
+- Show a few lines of context (unchanged lines)
+- Include the problematic line prefixed with `+`
+Example patch:
+```python
+patch="""--- a/api/users.py
++++ b/api/users.py
+@@ -10,6 +10,6 @@ def get_users(page, size):
+     offset = page * size
+-    return items[offset:offset + size]
++    return items[offset:offset + size - 1]
+"""
+```
+**Step 3**: Define at least one `GroundTruthIssue` with:
+- `keywords`: 2+ specific terms an agent body must contain (case-insensitive)
+- `line_number`: the line in the diff where the issue occurs (±3 tolerance for bugs/security, ±5 for arch)
+- `severity`: appropriate level (`critical` only for RCE/auth bypass/data loss)
+**Step 4**: Add to `ALL_SCENARIOS` list and verify:
+```bash
+PYTHONPATH=. python -m pytest tests/test_scenarios.py -v
+```
+All 30 (or more) scenarios must pass validation.
+---
+## 🚀 Pull Request Process
+1. Fork the repo and create a branch: `feat/my-feature`, `fix/my-bug`, `test/more-tests`
+2. Make your changes
+3. Run the full test suite: `PYTHONPATH=. python -m pytest tests/ -v`
+4. Run the linter: `pylint codelens_env/ app.py` (target score ≥ 7.0)
+5. Open a PR against `main` with a clear description
+---
+## 📄 Code Style
+- **Type hints** on all public functions and methods
+- **Docstrings** on all public classes and non-trivial functions
+- **pylint score** ≥ 7.0
+- **Line length** ≤ 100 characters
+- No bare `except:` clauses — always specify the exception type
+---
+## 📝 Commit Message Format
+We use [Conventional Commits](https://www.conventionalcommits.org/):
+```
+feat: add rate limiting to /reset endpoint
+fix: correct leaderboard rank calculation after slice
+test: add parametrized tests for all 30 scenarios
+docs: update README quick start commands
+refactor: extract episode cleanup into separate module
+chore: upgrade pydantic to 2.6.1
+```

Dockerfile CHANGED Viewed

@@ -1,20 +1,52 @@
-FROM python:3.11-slim
-WORKDIR /app
-RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-COPY . .
-EXPOSE 7860
-ENV PYTHONPATH=/app
-ENV PORT=7860
-HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
-  CMD curl -f http://localhost:7860/health || exit 1
-CMD ["python", "app.py"]

+# ── Stage 1: Builder ──────────────────────────────────────────
+FROM python:3.11-slim AS builder
+WORKDIR /build
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Install Python dependencies into /build/venv
 COPY requirements.txt .
+RUN python -m venv /build/venv \
+    && /build/venv/bin/pip install --upgrade pip \
+    && /build/venv/bin/pip install --no-cache-dir -r requirements.txt
+# ── Stage 2: Production ───────────────────────────────────────
+FROM python:3.11-slim AS production
+# Security: run as non-root user
+RUN useradd --create-home --shell /bin/bash appuser
+WORKDIR /app
+# Install runtime system dependencies only
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Copy virtualenv from builder
+COPY --from=builder /build/venv /app/venv
+# Copy application code
+COPY --chown=appuser:appuser . .
+# Create data directory for SQLite DB
+RUN mkdir -p /app/data && chown appuser:appuser /app/data
+# Switch to non-root user
+USER appuser
+# Use venv python
+ENV PATH="/app/venv/bin:$PATH"
+ENV PYTHONPATH="/app"
+ENV APP_PORT=7860
+EXPOSE 7860
+HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

GET_STARTED.md CHANGED Viewed

@@ -5,6 +5,7 @@ Welcome to **CodeLens.**, a production-grade AI agent evaluation environment. Th
 ---
 ## 1. Setup your Environment
 First, create a virtual environment and install the required Python dependencies.
 ```bash
@@ -18,7 +19,8 @@ pip install -r requirements.txt
 ---
 ## 2. Initialize the Database
-CodeLens uses SQLite for persistence. You must initialize the database before running the server for the first time.
 ```bash
 # Initialize the codelens.db with 30 baseline scenarios
@@ -28,6 +30,7 @@ python scripts/migrate.py init
 ---
 ## 3. Launch the System
 Start the FastAPI server. This serves both the **Agent API** and the **Interactive Dashboard**.
 ```bash
@@ -38,15 +41,17 @@ PYTHONPATH=. python app.py
 ---
 ## 4. Open the Dashboard
 Once the server is running, you can access the CodeLens Dashboard at:
 👉 **[http://localhost:7860/dashboard](http://localhost:7860/dashboard)**
-From here, you can see the top-10 leaderboard and monitor live agent evaluations.
 ---
 ## 5. Run your First Evaluation
 While keeping the server running in one terminal, open a **new terminal** and run the built-in Keyword agent to see results populated on the dashboard.
 ```bash
@@ -60,6 +65,7 @@ python scripts/evaluate.py --agent keyword
 ---
 ## 🧪 Running Tests
 To verify everything is working perfectly, you can run the full 155-test suite:
 ```bash
@@ -68,7 +74,7 @@ PYTHONPATH=. pytest tests/ -v
 ---
-## \ud83d\udee0 Troubleshooting Common Errors
 ### 1. `ModuleNotFoundError: No module named 'requests'`
 This happens if you haven't activated the virtual environment in your current terminal tab.
@@ -80,7 +86,15 @@ The migration script requires an argument to proceed.
 ### 3. Logo not appearing in Dashboard
 If the logo shows a broken image placeholder:
-- **Fix**: Re-run the server with `PYTHONPATH=. python app.py`. The backend now has optimized routing to serve the `logo.svg`.
 ---

 ---
 ## 1. Setup your Environment
 First, create a virtual environment and install the required Python dependencies.
 ```bash
 ---
 ## 2. Initialize the Database
+CodeLens uses SQLite for persistent episode and leaderboard data. You must initialize the database before running the server for the first time.
 ```bash
 # Initialize the codelens.db with 30 baseline scenarios
 ---
 ## 3. Launch the System
 Start the FastAPI server. This serves both the **Agent API** and the **Interactive Dashboard**.
 ```bash
 ---
 ## 4. Open the Dashboard
 Once the server is running, you can access the CodeLens Dashboard at:
 👉 **[http://localhost:7860/dashboard](http://localhost:7860/dashboard)**
+From here, you can see the top-10 leaderboard and monitor real-time agent evaluations via the live event feed.
 ---
 ## 5. Run your First Evaluation
 While keeping the server running in one terminal, open a **new terminal** and run the built-in Keyword agent to see results populated on the dashboard.
 ```bash
 ---
 ## 🧪 Running Tests
 To verify everything is working perfectly, you can run the full 155-test suite:
 ```bash
 ---
+## 🛠️ Troubleshooting
 ### 1. `ModuleNotFoundError: No module named 'requests'`
 This happens if you haven't activated the virtual environment in your current terminal tab.
 ### 3. Logo not appearing in Dashboard
 If the logo shows a broken image placeholder:
+- **Fix**: Re-run the server with `PYTHONPATH=. python app.py`. The backend has optimized routing to serve the brand iconography from the root.
+---
+## 🤝 Next Steps
+- **Add Scenarios**: Learn how to author new code review benchmarks in **[CONTRIBUTING.md](CONTRIBUTING.md)**.
+- **Batch Evaluation**: Scale up from single evaluations to full 30-scenario reports using `scripts/evaluate.py`.
+- **Docker Deployment**: Deploy a production-ready container with `docker compose up`.
 ---

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Arsh Verma
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -2,205 +2,203 @@
   <img src="assets/codelens-brand-v2.svg" width="400" alt="CodeLens." />
 </p>
-# CodeLens
-> **Can an AI agent catch the SQL injection that caused the $100M breach — before it ships?**
-This environment trains and evaluates agents on realistic Python code reviews grounded in real-world incident patterns. Unlike toy examples, every scenario is calibrated against actual production failure modes: payment mutations without idempotency keys, JWT verification bypassed for "dev convenience," pickle deserialization opening RCE vectors.
-[![CodeLens](https://img.shields.io/badge/CodeLens-1.0-blue)](https://huggingface.co/) [![Python 3.11](https://img.shields.io/badge/python-3.11-green)](https://python.org) [![FastAPI](https://img.shields.io/badge/FastAPI-0.109-red)](https://fastapi.tiangolo.com)
 ---
-## Tasks
-| Task | Difficulty | Max Steps | Scenarios | Focus |
-|------|-----------|-----------|-----------|-------|
-| `bug_detection` | Easy | 10 | 10 | Off-by-one, race conditions, None deref, type mismatches |
-| `security_audit` | Medium | 15 | 10 | SQL injection, XSS, JWT bypass, pickle RCE, timing attacks |
-| `architectural_review` | Hard | 20 | 10 | N+1 queries, god objects, missing idempotency, SRP violations |
----
-## Observation Space
-Each step the agent receives an `Observation` object:
-| Field | Type | Description |
-|-------|------|-------------|
-| `task_id` | `enum` | `bug_detection`, `security_audit`, or `architectural_review` |
-| `pr_title` | `str` | Pull request title (incident-inspired framing) |
-| `pr_description` | `str` | PR description from the author |
-| `diff` | `str` | Unified diff of the PR |
-| `files_changed` | `list[FileChange]` | Structured list of changed files |
-| `step_count` | `int` | Current step (0-indexed start after reset) |
-| `max_steps` | `int` | Maximum allowed steps for this task |
-| `history` | `list[ActionRecord]` | All actions taken so far this episode |
-| `noise_budget` | `int` | Remaining false-positive allowance (starts at 5) |
-| `service_name` | `str` | Name of the service being reviewed |
-| `service_criticality` | `"low"\|"medium"\|"high"\|"critical"` | How critical this service is to infrastructure |
-| `blast_radius` | `"low"\|"medium"\|"high"\|"critical"` | How many users/systems a bug here would affect |
-| `affected_users` | `int` | Estimated number of users impacted by a failure |
 ---
-## Action Space
-The agent submits one action per step as a typed `Action` object:
-| `action_type` | Required Fields | Description |
-|--------------|----------------|-------------|
-| `flag_issue` | `body`, `filename`, `line_number`, `severity`, `category` | Flag a specific issue in the diff |
-| `approve` | `body`, `verdict="LGTM"` | Approve the PR — no issues or all caught |
-| `request_changes` | `body`, `verdict="REQUEST_CHANGES"` | Block merge — issues must be fixed |
-| `comment` | `body` | Leave a general comment (no reward signal) |
-| `ask_question` | `body` | Ask a clarifying question (no reward signal) |
-**Valid severities:** `low`, `medium`, `high`, `critical`
-**Valid categories:** `bug`, `security`, `architecture`, `performance`, `style`, `design`
 ---
-## Reward Function
-Rewards are **incremental per step** (not end-of-episode):
-| Event | Reward Delta |
-|-------|-------------|
-| Correctly flag a ground-truth issue | `+0.1` to `+0.7` (depends on full grader recalculation) |
-| False positive flag | `-0.05` (consumes noise budget) |
-| Correct terminal verdict (approve/request_changes) | Final grader score delta |
-| Noise budget exhausted (5 FPs) | Episode terminates |
-**Grader formulas:**
-- **Bug:** `0.7 × recall + 0.3 × precision`
-- **Security:** `0.7 × severity_accuracy + 0.3 × keyword_overlap` (normalized by GT issues)
-- **Architecture:** `0.6 × issue_score + 0.2 × verdict_score + min(0.2, quality_bonus)`
----
-## API Endpoints
-```
-POST /reset                    → ResetResponse (episode_id + initial observation)
-POST /step/{episode_id}        → StepResult   (observation, reward, done, info)
-GET  /state/{episode_id}       → StateResult  (step, score, issues_found, done)
-GET  /result/{episode_id}      → EpisodeResult (final_score, issues_found/missed)
-GET  /health                   → {"status": "ok", ...}
-GET  /leaderboard              → top-10 per task
-POST /submit                   → submit agent score to leaderboard
-WS   /ws/events                → real-time step event stream
-```
 ---
-## Project Structure
-```
-.
-├── inference.py              # Root inference script (CodeLens spec required)
-├── app.py                    # FastAPI entry point
-├── codelens.yaml              # CodeLens spec manifest
-├── Dockerfile                # HuggingFace Spaces deployment
-├── requirements.txt
-├── codelens_env/
-│   ├── env.py                # Episode state machine with incremental rewards
-│   ├── models.py             # Pydantic models (Observation, Action, StateResult...)
-│   ├── scenario_bank.py      # 30 scenarios with service metadata
-│   └── graders/
-│       ├── bug_grader.py     # Recall × Precision scoring
-│       ├── security_grader.py # Severity accuracy + keyword overlap
-│       ├── arch_grader.py    # Issue + verdict + quality scoring
-│       └── grader_utils.py   # Line-number match + keyword overlap
-└── tests/
-    ├── test_env.py           # State machine + get_state() + reward tests
-    └── test_graders.py       # Grader unit tests
-```
 ---
-## \ud83d\ude80 Quick Start
-### 1. Setup Environment
 ```bash
-python3 -m venv venv && source venv/bin/activate
-pip install -r requirements.txt
 ```
-### 2. Initialize Database
 ```bash
-# This creates the codelens.db with all standard scenarios
-python scripts/migrate.py init
 ```
-### 3. Launch CodeLens
 ```bash
-PYTHONPATH=. python app.py
-# API and Dashboard are now live at http://localhost:7860/dashboard
 ```
-### 4. Run Evaluation (Baseline)
-In a new terminal:
 ```bash
-python scripts/evaluate.py --agent keyword
 ```
 ```bash
-export API_BASE_URL="https://api.openai.com/v1"
-export MODEL_NAME="gpt-4o"
-export HF_TOKEN="your-openai-key"
-export ENV_URL="http://localhost:7860"
-PYTHONPATH=. python inference.py
 ```
-Output format:
-```
-[START] task=bug_detection env=http://localhost:7860 model=gpt-4o
-[STEP] step=1 action='flag_issue' reward=0.7000 done=False error=None
-[STEP] step=2 action='approve' reward=0.0000 done=True error=None
-[END] success=True steps=2 score=0.7000 rewards=[0.7, 0.0]
 ```
 ---
-## Baseline Scores
-*Run `python inference.py` after starting the server to reproduce.*
-| Task | Model | Avg Score | Success Rate |
-|------|-------|-----------|-------------|
-| `bug_detection` | gpt-3.5-turbo | ~0.52 | ~60% |
-| `security_audit` | gpt-3.5-turbo | ~0.38 | ~40% |
-| `architectural_review` | gpt-3.5-turbo | ~0.28 | ~30% |
-| `bug_detection` | gpt-4o | ~0.74 | ~80% |
-| `security_audit` | gpt-4o | ~0.61 | ~70% |
-| `architectural_review` | gpt-4o | ~0.45 | ~50% |
-> `architectural_review` is intentionally hard — frontier models score below 0.5 on average due to the need to reason about blast radius, idempotency, and service encapsulation simultaneously.
----
-## Docker / HuggingFace Spaces
-```bash
-docker build -t codelens-env .
-docker run -p 7860:7860 \
-  -e PYTHONPATH=/app \
-  codelens-env
 ```
-The server starts automatically via `python app.py`.
 ---
-## Features
-- **30 Realistic Scenarios** — Incident-inspired PR titles tied to real service names, affected user counts, and blast radius labels
-- **Deterministic Grading** — MoE-style confidence-weighted matching with explainable per-issue scoring rubrics
-- **Incremental Rewards** — Step-level reward signals (`+δ` per correct flag, `-0.05` per FP) enable proper RL training
-- **Noise Budget** — Penalizes false positives to prevent reward gaming; episode terminates at 5 FPs
-- **Blast Radius Context** — `affected_users`, `service_criticality`, `blast_radius` in every observation
-- **WebSocket Stream** — Real-time step event broadcasting on `/ws/events`
-- **Leaderboard** — In-memory top-10 tracking per task
-- **Full CodeLens Spec** — `/reset`, `/step`, `/state`, `/result` + `[START]`/`[STEP]`/`[END]` stdout format

   <img src="assets/codelens-brand-v2.svg" width="400" alt="CodeLens." />
 </p>
+# CodeLens Environment
+![CI](https://github.com/ArshVermaGit/open-ev-code-handler/actions/workflows/ci.yml/badge.svg)
+![Python](https://img.shields.io/badge/python-3.10%2B-blue)
+![License](https://img.shields.io/badge/license-MIT-green)
+![Docker](https://img.shields.io/badge/docker-ghcr.io-blue)
+> **AI evaluation environment for benchmarking code review agents on 30 synthetic pull requests.**
+CodeLens is a high-fidelity evaluation environment where AI agents act as senior code reviewers. They analyze pull request diffs to identify bugs, security vulnerabilities, and architectural issues before providing a final verdict.
+Designed for researchers and developers building the next generation of AI code assistants, CodeLens provides 30 realistic Python scenarios with ground-truth labels and deterministic, reproducible scoring.
 ---
+## 🚀 Quick Start
+Get up and running locally in under 2 minutes:
+```bash
+git clone https://github.com/ArshVermaGit/open-ev-code-handler.git
+cd open-ev-code-handler
+cp .env.example .env
+python3 -m venv venv && source venv/bin/activate
+pip install -r requirements.txt
+python scripts/migrate.py init
+PYTHONPATH=. python app.py
+```
+- **Dashboard**: [http://localhost:7860/dashboard](http://localhost:7860/dashboard)
+- **API Docs**: [http://localhost:7860/docs](http://localhost:7860/docs)
 ---
+## 📋 Evaluation Tasks
+CodeLens benchmarks agents across three critical engineering domains:
+| Task | Scenarios | Max Steps | Focus Area |
+|------|-----------|-----------|------------|
+| `bug_detection` | 10 | 10 | Off-by-one errors, null dereferences, race conditions, exception handling |
+| `security_audit` | 10 | 15 | SQL injection, hardcoded secrets, path traversal, insecure deserialization |
+| `architectural_review` | 10 | 20 | N+1 queries, god classes, blocking async calls, circular imports |
 ---
+## 📈 Scoring System
+### Bug Detection
+Score = `0.4 × coverage + 0.6 × avg_issue_score − 0.1 × false_positive_rate`
+Issues are scored on **keyword accuracy** (50%) and **severity matching** (50%).
+### Security Audit
+Score = `avg(per_issue_score)` where each issue = `0.7 × severity_accuracy + 0.3 × keyword_coverage`.
+Severity accuracy is distance-weighted: misclassifying a **CRITICAL** issue as **LOW** incurs a major penalty.
+### Architectural Review
+Score = `0.6 × detection_rate + 0.2 × verdict_accuracy + 0.2 × detail_quality`.
+Detail quality rewards technical explanations that provide actionable developer feedback.
+### 🛑 Noise Budget
+Every episode permits **5 false positive credits**. Flagging non-existent code paths spends one credit. Reaching zero terminates the episode immediately to prevent agent hallucination loops.
 ---
+## 🔌 API Reference
+| Method | Endpoint | Auth | Description |
+|:-------|:---------|:-----|:------------|
+| `POST` | `/reset` | Optional | Start a new evaluation episode |
+| `POST` | `/step/{id}` | Optional | Submit a review action (flag_issue, approve) |
+| `GET` | `/result/{id}` | Optional | Retrieve final scores and logs for an episode |
+| `GET` | `/leaderboard` | None | Paginated performance rankings |
+| `POST` | `/submit` | Optional | Persist an episode result to the leaderboard |
+| `GET` | `/stats` | None | Aggregate statistics across all agents |
+| `GET` | `/episodes/{id}/replay` | Optional | Full event-by-event history replay |
+| `GET` | `/dashboard` | None | Interactive Real-time Dashboard |
+| `GET` | `/health` | None | System status and health check |
+Authentication is disabled by default. Set `API_KEY_ENABLED=true` in `.env` for production parity.
 ---
+## 🐳 Running with Docker
+### Production Mode
 ```bash
+docker compose up -d
+# View logs: docker compose logs -f
 ```
+### Direct Pull
 ```bash
+docker run -p 7860:7860 ghcr.io/ArshVermaGit/open-ev-code-handler:latest
 ```
+### Automated Testing
 ```bash
+docker compose -f docker-compose.test.yml up
 ```
+---
+## 🤖 Baseline Agent & Evaluation
+### Single Scenario Trial
 ```bash
+python scripts/baseline.py --task bug_detection --seed 3 --verbose
 ```
+### Full Benchmark (All 30 Scenarios)
 ```bash
+# Keyword-based baseline
+python scripts/evaluate.py --agent keyword --output results.json
+# LLM-powered reviewer (e.g. Claude)
+python scripts/evaluate.py --agent llm --api-key $ANTHROPIC_API_KEY
 ```
+---
+## 🧠 Writing Your Own Agent
+CodeLens is designed to be agent-agnostic. Use standard HTTP requests to build your reviewer:
+```python
+import requests
+API = "http://localhost:7860"
+# Start new episode
+resp = requests.post(f"{API}/reset", json={"task_id": "bug_detection", "seed": 0})
+episode_id = resp.json()["episode_id"]
+done = False
+while not done:
+    # Your agent logic analyzes the diff
+    action = {
+        "action_type": "flag_issue",
+        "body": "Identified a vulnerability line 14",
+        "filename": "api/search.py",
+        "line_number": 14,
+        "severity": "critical",
+        "category": "security"
+    }
+    result = requests.post(f"{API}/step/{episode_id}", json=action).json()
+    done = result["done"]
+# Get final results
+final = requests.get(f"{API}/result/{episode_id}").json()
+print(f"Final Score: {final['final_score']}")
 ```
 ---
+## 📂 Project Structure
+```text
+open-ev-code-handler/
+├── app.py                      # FastAPI application (9 endpoints)
+├── codelens_env/               # Core evaluation logic
+│   ├── database.py             # SQLModel persistence layer
+│   ├── env.py                  # Episode state machine
+│   ├── models.py               # Pydantic v2 data models
+│   ├── scenarios.py            # 30 Synthetic PR scenarios
+│   └── graders/                # Grader implementations (Bug, Sec, Arch)
+├── scripts/                    # CLI tools (baseline, evaluate, migrate)
+├── static/                     # Compiled dashboard assets
+├── tests/                      # 155+ Parametrized tests
+├── Dockerfile                  # Multi-stage, non-root build
+├── docker-compose.yml          # Production orchestration
+└── codelens.yaml               # CodeLens v2 specification
+```
+---
+## 🛠️ Development
+```bash
+# Setup
+python -m venv venv && source venv/bin/activate
+pip install -r requirements.txt
+# Automated Tests
+PYTHONPATH=. pytest tests/ -v --cov=codelens_env
+# Linter Check
+pylint codelens_env/ app.py
+# Scenario Sanity Check
+PYTHONPATH=. python scripts/validate.py
 ```
 ---
+## 📄 Contributing & License
+Please see **[CONTRIBUTING.md](CONTRIBUTING.md)** for details on authoring new scenarios and submission standards.
+This project is licensed under the **[MIT License](LICENSE)**.

codelens.yaml CHANGED Viewed

@@ -1,54 +1,76 @@
-version: "1.0"
-name: "codelens-codelens"
 description: >
-  AI Senior Code Reviewer evaluation environment — trains agents to detect bugs,
-  security vulnerabilities, and architectural issues in realistic Python PRs.
-  Grounded in real-world incident patterns (payment systems, auth layers, microservices).
 entry_point: "app:app"
-base_url: "http://localhost:7860"
-api_version: "v1"
-tags:
-  - codelens
-  - code-review
-  - security
-  - software-engineering
 tasks:
   - id: "bug_detection"
-    difficulty: easy
     max_steps: 10
     scenarios: 10
-    description: >
-      Identify logical errors, off-by-one bugs, mutable default arguments,
-      None dereferences, race conditions, and type mismatches in Python snippets.
-      Agents must FLAG issues with correct category/severity and submit a final verdict.
   - id: "security_audit"
-    difficulty: medium
     max_steps: 15
     scenarios: 10
-    description: >
-      Detect OWASP Top-10 vulnerabilities: SQL injection, XSS, hardcoded secrets,
-      JWT bypass, insecure deserialization (pickle RCE), path traversal, timing attacks,
-      CORS misconfiguration, and missing rate limits in a payment-adjacent Python codebase.
   - id: "architectural_review"
-    difficulty: hard
     max_steps: 20
     scenarios: 10
-    description: >
-      Evaluate system design quality: SRP violations, direct DB access bypassing API layers,
-      synchronous blocking calls in event loops, missing idempotency keys on payment mutations,
-      N+1 query patterns, god object anti-patterns, and shared mutable state between microservices.
-      Requires reasoning about blast radius and service criticality in addition to code issues.
 grading:
   type: "deterministic"
-  issue_matching:
     coverage_weight: 0.4
-    precision_weight: 0.6
-  quality_scoring:
-    severity_weight: 0.7
-    keyword_weight: 0.3

+version: "2.0"
+name: "agentorg-codereview"
 description: >
+  AI Senior Code Reviewer evaluation environment for CodeLens.
+  Benchmarks agents on 30 synthetic pull requests across Bug Detection,
+  Security Audit, and Architectural Review tasks.
 entry_point: "app:app"
+dashboard: "/dashboard"
+api_docs: "/docs"
 tasks:
   - id: "bug_detection"
+    description: "Identify logical errors and edge cases in Python code"
     max_steps: 10
     scenarios: 10
+    difficulty_distribution:
+      easy: 2
+      medium: 6
+      hard: 2
   - id: "security_audit"
+    description: "Detect OWASP Top 10 vulnerabilities in Python code"
     max_steps: 15
     scenarios: 10
+    difficulty_distribution:
+      easy: 1
+      medium: 7
+      hard: 2
   - id: "architectural_review"
+    description: "Evaluate design patterns, coupling, and system constraints"
     max_steps: 20
     scenarios: 10
+    difficulty_distribution:
+      easy: 0
+      medium: 7
+      hard: 3
+environment:
+  noise_budget: 5
+  line_tolerance_bug: 3
+  line_tolerance_arch: 5
+  keyword_match: "any"       # agent body must contain ANY listed keyword
+  case_sensitive: false
 grading:
   type: "deterministic"
+  bug_detection:
     coverage_weight: 0.4
+    avg_issue_score_weight: 0.6
+    issue_score:
+      keyword_weight: 0.5
+      severity_weight: 0.5
+    false_positive_penalty: 0.1
+  security_audit:
+    formula: "avg_issue_score"
+    issue_score:
+      severity_weight: 0.7
+      keyword_weight: 0.3
+    severity_scale:
+      critical: 4
+      high: 3
+      medium: 2
+      low: 1
+      info: 0
+    severity_penalty_per_level: 0.3
+  architectural_review:
+    issue_detection_weight: 0.6
+    verdict_weight: 0.2
+    quality_weight: 0.2
+    quality_min_body_length: 20
+    quality_max_body_length: 200

docker-compose.test.yml ADDED Viewed

	@@ -0,0 +1,15 @@

+version: "3.9"
+services:
+  test:
+    build:
+      context: .
+      target: builder
+    container_name: codelens-test
+    command: >
+      sh -c "/build/venv/bin/pip install pytest pytest-cov &&
+             PYTHONPATH=/app /build/venv/bin/python -m pytest tests/ -v --tb=short"
+    volumes:
+      - .:/app
+    environment:
+      - APP_ENV=test

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+version: "3.9"
+services:
+  api:
+    build:
+      context: .
+      target: production
+    container_name: codelens-api
+    ports:
+      - "${APP_PORT:-7860}:7860"
+    env_file:
+      - .env
+    environment:
+      - APP_ENV=${APP_ENV:-development}
+      - LOG_LEVEL=${LOG_LEVEL:-INFO}
+    volumes:
+      - codelens-data:/app/data
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 15s
+volumes:
+  codelens-data:
+    driver: local

requirements.txt CHANGED Viewed

@@ -13,3 +13,4 @@ sqlmodel==0.0.16
 aiosqlite==0.20.0
 pytest-cov==4.1.0
 aiofiles==23.2.1

 aiosqlite==0.20.0
 pytest-cov==4.1.0
 aiofiles==23.2.1
+PyYAML>=6.0.1