---
# Manifest for the "agentorg-codereview" evaluation environment (CodeLens).
# Sections: project metadata, task definitions, environment settings and
# per-task grading parameters.

# --- Project metadata ------------------------------------------------------
version: "1.0.0"
name: "agentorg-codereview"
owners: ["Arsh Verma", "Divyansh Rawat"]
# Folded scalar: the three lines below are joined with spaces into a single
# line of text that ends with one newline.
description: >
  AI Senior Code Reviewer evaluation environment for CodeLens.
  Benchmarks agents on 30 synthetic pull requests across Bug Detection,
  Security Audit, and Architectural Review tasks.

entry_point: "app:app"
dashboard: "/dashboard"
api_docs: "/docs"
license: "MIT"
tags: ["code-review", "agentic-eval", "security-audit", "bug-detection"]
contact: "Arsh Verma <arsh@example.com>"

# --- Tasks -----------------------------------------------------------------
# Three tasks of 10 scenarios each, i.e. the 30 pull requests mentioned in
# the description. In every task the difficulty_distribution counts add up
# to that task's `scenarios` value; keep them in sync when editing.
tasks:
  - id: "bug_detection"
    description: "Identify logical errors and edge cases in Python code"
    max_steps: 10
    scenarios: 10
    difficulty_distribution:
      easy: 2
      medium: 6
      hard: 2

  - id: "security_audit"
    description: "Detect OWASP Top 10 vulnerabilities in Python code"
    max_steps: 15
    scenarios: 10
    difficulty_distribution:
      easy: 1
      medium: 7
      hard: 2

  - id: "architectural_review"
    description: "Evaluate design patterns, coupling, and system constraints"
    max_steps: 20
    scenarios: 10
    difficulty_distribution:
      easy: 0
      medium: 7
      hard: 3

# --- Environment settings --------------------------------------------------
# NOTE(review): the units and exact semantics of these values are defined by
# the consuming application, not by this file - confirm before changing.
environment:
  noise_budget: 5
  line_tolerance_bug: 3
  line_tolerance_arch: 5
  keyword_match: "any"
  case_sensitive: false

# --- Grading ---------------------------------------------------------------
# One sub-section per task id declared under `tasks`.
grading:
  type: "deterministic"

  bug_detection:
    # coverage_weight + avg_issue_score_weight = 1.0
    coverage_weight: 0.4
    avg_issue_score_weight: 0.6
    issue_score:
      # keyword_weight + severity_weight = 1.0
      keyword_weight: 0.5
      severity_weight: 0.5
    # NOTE(review): assumed to be a task-level setting rather than a member
    # of issue_score - confirm against the grader.
    false_positive_penalty: 0.1

  security_audit:
    formula: "avg_issue_score"
    issue_score:
      # severity_weight + keyword_weight = 1.0
      severity_weight: 0.7
      keyword_weight: 0.3
    # NOTE(review): severity_scale and severity_penalty_per_level are assumed
    # to be siblings of issue_score - confirm against the grader.
    severity_scale:
      critical: 4
      high: 3
      medium: 2
      low: 1
      info: 0
    severity_penalty_per_level: 0.3

  architectural_review:
    # issue_detection_weight + verdict_weight + quality_weight = 1.0
    issue_detection_weight: 0.6
    verdict_weight: 0.2
    quality_weight: 0.2
    quality_min_body_length: 20
    quality_max_body_length: 200