""" DR-Bench Evaluation Pipeline. Provides LLM-as-judge coverage evaluation for agent outputs against ground truth pitch points extracted from success stories. Pipeline: 1. Extract ground truth pitch points from success stories (extract_pitch_points.py) 2. Run coverage judge to score agent outputs against GT (coverage_judge.py) 3. Compute weighted coverage scores (weighted_score.py) 4. Orchestrate evaluation via CoverageEvaluator (evaluator.py) """ from evaluation.evaluator import CoverageEvaluator, EvaluationResult from evaluation.weighted_score import calculate_score __all__ = ["CoverageEvaluator", "EvaluationResult", "calculate_score"]