File size: 4,519 Bytes
0d95482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
adea8c3
0d95482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
"""
Batch evaluation: runs all 30 scenarios and prints a summary report.
Usage: python scripts/evaluate.py --url http://localhost:7860 --agent keyword --output results.json
"""

import argparse
import sys
import json
import time
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from scripts.baseline import KeywordAgent, LLMAgent, run_episode, save_results

# Task identifiers, in the order the batch run iterates over them.
TASKS = [
    "bug_detection",
    "security_audit",
    "architectural_review",
]
# Seeds 0-9; every task is run once per seed.
SEEDS = [*range(10)]

def run_batch_evaluation(url: str, agent, verbose: bool = False) -> list:
    """Run every task/seed combination and return one record per scenario.

    Iterates the module-level ``TASKS`` and ``SEEDS`` and calls
    ``run_episode(url, task, seed, agent, verbose)`` for each pair, printing
    a one-line progress report per scenario.

    Args:
        url: Base URL, passed through to ``run_episode``.
        agent: Agent object, passed through to ``run_episode``.
        verbose: Verbosity flag, passed through to ``run_episode``.

    Returns:
        A list with exactly one entry per scenario: the dict returned by
        ``run_episode`` on success, or a dict with the keys ``task_id``,
        ``seed``, ``final_score`` (0.0) and ``error`` on failure.
    """
    bar_width = 10
    all_results = []

    for task in TASKS:
        print(f"\n\u2500\u2500 Task: {task} \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500")
        for seed in SEEDS:
            try:
                result = run_episode(url, task, seed, agent, verbose)
                score = result["final_score"]
                # Clamp so an out-of-range score cannot produce a bar that is
                # longer or shorter than bar_width.
                filled = max(0, min(bar_width, int(score * bar_width)))
                bar = "\u2588" * filled + "\u2591" * (bar_width - filled)
                line = (
                    f"  Seed {seed:2d}: [{bar}] {score:.3f}  "
                    f"({result['issues_found']}/{result['issues_total']} issues)"
                )
            except Exception as e:
                # Best-effort batch: one failing scenario must not abort the
                # run. It is recorded as a zero-score error entry instead.
                print(f"  Seed {seed:2d}: FAILED \u2014 {e}")
                all_results.append({"task_id": task, "seed": seed, "final_score": 0.0, "error": str(e)})
            else:
                # Record the result only once it is known to be well-formed,
                # so a malformed result is never stored as both a success
                # and an error.
                all_results.append(result)
                print(line)

    return all_results

def print_summary(results: list):
    """Print a formatted summary report of batch-evaluation results.

    Results carrying an ``"error"`` key are excluded from the statistics; the
    remaining ones are grouped by ``"task_id"`` and summarised by their
    ``"final_score"``. The overall line reports how many of the scenarios in
    ``results`` completed without an error.

    Args:
        results: Result dicts, as produced by ``run_batch_evaluation``.
    """
    from collections import defaultdict
    import statistics

    rule = "=" * 60
    print("\n" + rule)
    print("EVALUATION SUMMARY")
    print(rule)

    by_task = defaultdict(list)
    for r in results:
        if "error" not in r:
            by_task[r["task_id"]].append(r["final_score"])

    overall_scores = [s for scores in by_task.values() for s in scores]

    for task, scores in by_task.items():
        if scores:
            # stdev needs at least two data points; report 0 for a single run.
            spread = statistics.stdev(scores) if len(scores) > 1 else 0
            print(f"\n{task.upper().replace('_', ' ')}")
            print(f"  Mean:   {statistics.mean(scores):.3f}")
            print(f"  Median: {statistics.median(scores):.3f}")
            print(f"  Stdev:  {spread:.3f}")
            print(f"  Best:   {max(scores):.3f}")
            print(f"  Worst:  {min(scores):.3f}")

    if overall_scores:
        passed = sum(1 for s in overall_scores if s > 0.5)
        # The denominator is the number of scenarios actually attempted, which
        # is fewer than 30 when only a single task was run.
        print(f"\nOVERALL ({len(overall_scores)}/{len(results)} scenarios)")
        print(f"  Mean score: {statistics.mean(overall_scores):.3f}")
        print(f"  Success rate (>0.5): {passed/len(overall_scores)*100:.1f}%")

    print(rule)
def main():
    """Parse CLI arguments, run the batch evaluation, and save the results.

    Exits with status 1 when the LLM agent is selected without an API key
    (``--api-key`` or the ``ANTHROPIC_API_KEY`` environment variable), or
    when the ``/health`` endpoint under ``--url`` cannot be reached.
    """
    # Declared first: TASKS is read below for the --task choices and may be
    # rebound afterwards, and ``global`` must precede any use of the name.
    global TASKS

    parser = argparse.ArgumentParser(description="Batch evaluation of all 30 CodeLens scenarios")
    parser.add_argument("--url", default="http://localhost:7860")
    parser.add_argument("--agent", default="keyword", choices=["keyword", "llm"])
    parser.add_argument("--api-key", default="")
    parser.add_argument("--output", default="results.json", help="Output file (.json or .csv)")
    parser.add_argument("--verbose", action="store_true")
    # Choices come from TASKS so the CLI cannot drift from the task list. The
    # None default needs no entry here, since argparse only validates values
    # actually supplied on the command line.
    parser.add_argument("--task", default=None,
                        choices=tuple(TASKS),
                        help="Run only a specific task (default: all)")
    args = parser.parse_args()

    if args.agent == "llm":
        import os
        api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY", "")
        if not api_key:
            print("ERROR: LLM agent requires --api-key or ANTHROPIC_API_KEY env var")
            sys.exit(1)
        agent = LLMAgent(api_key)
    else:
        agent = KeywordAgent()

    # Check connectivity before starting the run.
    try:
        import requests
        requests.get(f"{args.url}/health", timeout=5).raise_for_status()
    except Exception as e:
        print(f"ERROR: Cannot connect to {args.url}: {e}")
        sys.exit(1)

    # run_batch_evaluation reads the module-level TASKS, so narrowing the run
    # to a single task is done by rebinding it.
    if args.task:
        TASKS = [args.task]

    print(f"Running evaluation: {len(TASKS)} task(s), {len(SEEDS)} seeds each")
    print(f"Agent: {args.agent} | API: {args.url}")
    # Monotonic clock: the elapsed time is unaffected by wall-clock changes.
    start = time.perf_counter()

    results = run_batch_evaluation(args.url, agent, args.verbose)

    print(f"\nCompleted in {time.perf_counter()-start:.1f}s")
    print_summary(results)

    if args.output:
        save_results(results, args.output)
        print(f"\nResults saved to: {args.output}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()