File size: 1,934 Bytes
# Package metadata for the CodeLens code-review evaluation environment.
version: "1.0.0"
name: "agentorg-codereview"
owners: ["Arsh Verma", "Divyansh Rawat"]

# Folded scalar: the indented lines are joined into a single paragraph.
description: >
  AI Senior Code Reviewer evaluation environment for CodeLens.
  Benchmarks agents on 30 synthetic pull requests across Bug Detection,
  Security Audit, and Architectural Review tasks.

# NOTE(review): looks like a "module:attribute" application reference;
# confirm against whatever loads this manifest.
entry_point: "app:app"
# URL paths (quoted so the leading "/" is always read as a plain string).
dashboard: "/dashboard"
api_docs: "/docs"

license: "MIT"
tags: ["code-review", "agentic-eval", "security-audit", "bug-detection"]
contact: "Arsh Verma <arsh@example.com>"
# Task families evaluated by the environment.
# Each task lists 10 scenarios (30 in total), and the counts under
# difficulty_distribution add up to that task's `scenarios` value.
tasks:
  - id: "bug_detection"
    description: "Identify logical errors and edge cases in Python code"
    max_steps: 10
    scenarios: 10
    difficulty_distribution:
      easy: 2
      medium: 6
      hard: 2

  - id: "security_audit"
    description: "Detect OWASP Top 10 vulnerabilities in Python code"
    max_steps: 15
    scenarios: 10
    difficulty_distribution:
      easy: 1
      medium: 7
      hard: 2

  - id: "architectural_review"
    description: "Evaluate design patterns, coupling, and system constraints"
    max_steps: 20
    scenarios: 10
    difficulty_distribution:
      # No easy scenarios for this task; kept explicit so all three
      # difficulty keys are present for every task.
      easy: 0
      medium: 7
      hard: 3
# Environment-level matching settings.
# NOTE(review): the code that consumes these keys is not visible here;
# confirm the units of noise_budget and the line_tolerance_* values
# (presumably line-number offsets) against the grader.
environment:
  noise_budget: 5
  line_tolerance_bug: 3
  line_tolerance_arch: 5
  keyword_match: "any"  # agent body must contain ANY listed keyword
  case_sensitive: false
# Scoring configuration, one section per task id.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file. Confirm that false_positive_penalty and severity_penalty_per_level
# belong at the task level rather than inside the preceding sub-mapping.
grading:
  type: "deterministic"

  bug_detection:
    # The two top-level weights sum to 1.0.
    coverage_weight: 0.4
    avg_issue_score_weight: 0.6
    # Per-issue score components; these also sum to 1.0.
    issue_score:
      keyword_weight: 0.5
      severity_weight: 0.5
    false_positive_penalty: 0.1

  security_audit:
    formula: "avg_issue_score"
    # Per-issue score components; severity outweighs keywords (0.7 vs 0.3).
    issue_score:
      severity_weight: 0.7
      keyword_weight: 0.3
    # Numeric rank assigned to each severity label.
    severity_scale:
      critical: 4
      high: 3
      medium: 2
      low: 1
      info: 0
    severity_penalty_per_level: 0.3

  architectural_review:
    # The three weights sum to 1.0.
    issue_detection_weight: 0.6
    verdict_weight: 0.2
    quality_weight: 0.2
    quality_min_body_length: 20
    quality_max_body_length: 200
|