# Package metadata for the CodeLens code-review evaluation environment.
version: '1.0.0'
name: 'agentorg-codereview'
owners:
  - 'Arsh Verma'
  - 'Divyansh Rawat'
# Folded scalar: the indented lines are joined with single spaces and the
# value ends with exactly one trailing newline.
description: >
  AI Senior Code Reviewer evaluation environment for CodeLens.
  Benchmarks agents on 30 synthetic pull requests across Bug Detection,
  Security Audit, and Architectural Review tasks.

# NOTE(review): looks like a "module:attribute" application reference;
# confirm against whatever loads this manifest.
entry_point: 'app:app'
dashboard: '/dashboard'
api_docs: '/docs'
license: 'MIT'
tags:
  - 'code-review'
  - 'agentic-eval'
  - 'security-audit'
  - 'bug-detection'
contact: 'Arsh Verma <arsh@example.com>'

# Evaluation tasks. Every task declares 10 scenarios (30 in total), and the
# counts under difficulty_distribution add up to that task's `scenarios`.
# Keep the two in sync when editing either one.
tasks:
  # easy 2 + medium 6 + hard 2 = 10 scenarios
  - id: 'bug_detection'
    description: 'Identify logical errors and edge cases in Python code'
    max_steps: 10
    scenarios: 10
    difficulty_distribution:
      easy: 2
      medium: 6
      hard: 2

  # easy 1 + medium 7 + hard 2 = 10 scenarios
  - id: 'security_audit'
    description: 'Detect OWASP Top 10 vulnerabilities in Python code'
    max_steps: 15
    scenarios: 10
    difficulty_distribution:
      easy: 1
      medium: 7
      hard: 2

  # easy 0 + medium 7 + hard 3 = 10 scenarios (no easy cases for this task)
  - id: 'architectural_review'
    description: 'Evaluate design patterns, coupling, and system constraints'
    max_steps: 20
    scenarios: 10
    difficulty_distribution:
      easy: 0
      medium: 7
      hard: 3

# Environment-wide matching settings.
environment:
  # NOTE(review): the meaning and unit of noise_budget are not visible in
  # this file; confirm with the consuming code before changing it.
  noise_budget: 5
  # presumably the allowed line-number offset when matching an agent-reported
  # issue for the bug and architecture tasks - TODO confirm
  line_tolerance_bug: 3
  line_tolerance_arch: 5
  # agent body must contain ANY listed keyword
  keyword_match: 'any'
  case_sensitive: false

# Scoring configuration, one sub-section per task id.
grading:
  type: 'deterministic'

  bug_detection:
    # The two top-level weights sum to 1.0.
    coverage_weight: 0.4
    avg_issue_score_weight: 0.6
    # Per-issue score components; these weights also sum to 1.0.
    issue_score:
      keyword_weight: 0.5
      severity_weight: 0.5
    false_positive_penalty: 0.1

  security_audit:
    formula: 'avg_issue_score'
    # Per-issue score components (sum to 1.0); severity is weighted more
    # heavily here than in bug_detection.
    issue_score:
      severity_weight: 0.7
      keyword_weight: 0.3
    # Severity labels mapped to integer levels, highest first.
    severity_scale:
      critical: 4
      high: 3
      medium: 2
      low: 1
      info: 0
    # presumably applied once per level of difference on severity_scale
    # between reported and expected severity - TODO confirm
    severity_penalty_per_level: 0.3

  architectural_review:
    # The three component weights sum to 1.0.
    issue_detection_weight: 0.6
    verdict_weight: 0.2
    quality_weight: 0.2
    # NOTE(review): unit of the body-length bounds (characters vs. words) is
    # not visible in this file; confirm with the grader.
    quality_min_body_length: 20
    quality_max_body_length: 200