Spaces:

ArshVerma
/

CodeLens

Sleeping

File size: 1,934 Bytes

c1972ef
4b66647
3e1edbb
d8ee465
4b66647
 
 
 
cb3e1de
4b66647
 
f8670cd
 
 
cb3e1de
 
 
4b66647
cb3e1de
 
4b66647
 
 
 
d8ee465
cb3e1de
4b66647
cb3e1de
 
4b66647
 
 
 
d8ee465
cb3e1de
4b66647
cb3e1de
 
4b66647
 
 
 
 
 
 
 
 
 
 
cb3e1de
 
 
4b66647
 
d581a4f
4b66647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8ee465
4b66647

# Manifest metadata for the CodeLens code-review evaluation environment.
version: '1.0.0'  # quoted so the version stays a string
name: 'agentorg-codereview'
owners:
  - 'Arsh Verma'
  - 'Divyansh Rawat'
# Folded scalar (`>`): the lines join into one paragraph that ends with a
# single newline.
description: >
  AI Senior Code Reviewer evaluation environment for CodeLens.
  Benchmarks agents on 30 synthetic pull requests across Bug Detection,
  Security Audit, and Architectural Review tasks.

entry_point: 'app:app'  # NOTE(review): looks like module:attribute — confirm
dashboard: '/dashboard'
api_docs: '/docs'
license: 'MIT'
tags:
  - 'code-review'
  - 'agentic-eval'
  - 'security-audit'
  - 'bug-detection'
contact: 'Arsh Verma <arsh@example.com>'

# Task catalogue: three review tasks with 10 scenarios each (30 in total).
# Each difficulty_distribution sums to the `scenarios` count of its task.
# NOTE(review): max_steps is presumably a per-scenario step cap — confirm.
tasks:
  - id: 'bug_detection'
    description: 'Identify logical errors and edge cases in Python code'
    max_steps: 10
    scenarios: 10
    difficulty_distribution:  # 2 + 6 + 2 = 10
      easy: 2
      medium: 6
      hard: 2

  - id: 'security_audit'
    description: 'Detect OWASP Top 10 vulnerabilities in Python code'
    max_steps: 15
    scenarios: 10
    difficulty_distribution:  # 1 + 7 + 2 = 10
      easy: 1
      medium: 7
      hard: 2

  - id: 'architectural_review'
    description: 'Evaluate design patterns, coupling, and system constraints'
    max_steps: 20
    scenarios: 10
    difficulty_distribution:  # 0 + 7 + 3 = 10 (no easy scenarios)
      easy: 0
      medium: 7
      hard: 3

# Environment-wide matching settings.
# NOTE(review): the units and exact meaning of noise_budget and
# line_tolerance_* are not defined in this file — confirm with the consumer.
environment:
  noise_budget: 5
  line_tolerance_bug: 3
  line_tolerance_arch: 5
  keyword_match: 'any'  # agent body must contain ANY listed keyword
  case_sensitive: false

# Scoring configuration; the sub-section keys mirror the task ids.
grading:
  type: 'deterministic'

  # Top-level weights sum to 1.0 (0.4 + 0.6).
  bug_detection:
    coverage_weight: 0.4
    avg_issue_score_weight: 0.6
    issue_score:  # per-issue weights: 0.5 + 0.5 = 1.0
      keyword_weight: 0.5
      severity_weight: 0.5
    # NOTE(review): presumably applied per false positive — confirm.
    false_positive_penalty: 0.1

  # Severity outweighs keywords here (0.7 + 0.3 = 1.0).
  security_audit:
    formula: 'avg_issue_score'
    issue_score:
      severity_weight: 0.7
      keyword_weight: 0.3
    severity_scale:  # ordinal levels, highest first
      critical: 4
      high: 3
      medium: 2
      low: 1
      info: 0
    # NOTE(review): presumably per level of severity mismatch — confirm.
    severity_penalty_per_level: 0.3

  # The three weights sum to 1.0 (0.6 + 0.2 + 0.2).
  architectural_review:
    issue_detection_weight: 0.6
    verdict_weight: 0.2
    quality_weight: 0.2
    # NOTE(review): length units (characters vs. words) not stated — confirm.
    quality_min_body_length: 20
    quality_max_body_length: 200