File size: 1,934 Bytes
c1972ef
4b66647
3e1edbb
d8ee465
4b66647
 
 
 
cb3e1de
4b66647
 
f8670cd
 
 
cb3e1de
 
 
4b66647
cb3e1de
 
4b66647
 
 
 
d8ee465
cb3e1de
4b66647
cb3e1de
 
4b66647
 
 
 
d8ee465
cb3e1de
4b66647
cb3e1de
 
4b66647
 
 
 
 
 
 
 
 
 
 
cb3e1de
 
 
4b66647
 
d581a4f
4b66647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8ee465
4b66647
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Manifest metadata for the CodeLens code-review evaluation environment.
# The version stays quoted so it always loads as a string.
version: '1.0.0'
name: 'agentorg-codereview'
owners:
  - 'Arsh Verma'
  - 'Divyansh Rawat'
# Folded scalar (`>`): the wrapped lines below load as one line of text
# followed by a single trailing newline.
description: >
  AI Senior Code Reviewer evaluation environment for CodeLens.
  Benchmarks agents on 30 synthetic pull requests across Bug Detection,
  Security Audit, and Architectural Review tasks.

# NOTE(review): looks like a `module:attribute` application reference;
# confirm against whatever loads this manifest.
entry_point: 'app:app'
# NOTE(review): presumably URL paths served by the application; TODO confirm.
dashboard: '/dashboard'
api_docs: '/docs'
license: 'MIT'
tags:
  - 'code-review'
  - 'agentic-eval'
  - 'security-audit'
  - 'bug-detection'
contact: 'Arsh Verma <arsh@example.com>'

# Evaluation tasks. Every task lists 10 scenarios, and the counts under
# difficulty_distribution add up to that task's `scenarios` value.
# NOTE(review): max_steps is presumably a per-scenario step limit for the
# agent; confirm against the consumer of this file.
tasks:
  # Bug detection: 2 easy + 6 medium + 2 hard.
  - id: 'bug_detection'
    description: 'Identify logical errors and edge cases in Python code'
    max_steps: 10
    scenarios: 10
    difficulty_distribution:
      easy: 2
      medium: 6
      hard: 2

  # Security audit: 1 easy + 7 medium + 2 hard.
  - id: 'security_audit'
    description: 'Detect OWASP Top 10 vulnerabilities in Python code'
    max_steps: 15
    scenarios: 10
    difficulty_distribution:
      easy: 1
      medium: 7
      hard: 2

  # Architectural review: no easy scenarios; 7 medium + 3 hard.
  - id: 'architectural_review'
    description: 'Evaluate design patterns, coupling, and system constraints'
    max_steps: 20
    scenarios: 10
    difficulty_distribution:
      easy: 0
      medium: 7
      hard: 3

# Environment-wide matching settings.
# NOTE(review): the units of noise_budget and of the line_tolerance_* values
# are not stated in this file (presumably counts of comments / source lines);
# confirm against the consumer.
environment:
  noise_budget: 5
  line_tolerance_bug: 3
  line_tolerance_arch: 5
  # Agent body must contain ANY listed keyword.
  keyword_match: 'any'
  case_sensitive: false

# Scoring configuration; each sub-section is keyed by a task id.
grading:
  type: 'deterministic'

  # Top-level weights: coverage (0.4) + average issue score (0.6) = 1.0.
  bug_detection:
    coverage_weight: 0.4
    avg_issue_score_weight: 0.6
    # Per-issue split: keyword and severity count equally.
    issue_score:
      keyword_weight: 0.5
      severity_weight: 0.5
    false_positive_penalty: 0.1

  security_audit:
    formula: 'avg_issue_score'
    # Per-issue split: severity (0.7) outweighs keyword (0.3).
    issue_score:
      severity_weight: 0.7
      keyword_weight: 0.3
    # Numeric rank for each severity label, from critical (4) to info (0).
    severity_scale:
      critical: 4
      high: 3
      medium: 2
      low: 1
      info: 0
    # NOTE(review): presumably deducted per level of difference between the
    # reported and expected severity; TODO confirm.
    severity_penalty_per_level: 0.3

  # Weights: issue detection (0.6) + verdict (0.2) + quality (0.2) = 1.0.
  architectural_review:
    issue_detection_weight: 0.6
    verdict_weight: 0.2
    quality_weight: 0.2
    # NOTE(review): length unit (characters vs. words) is not stated here;
    # confirm against the grader.
    quality_min_body_length: 20
    quality_max_body_length: 200