"""LLM-as-a-Judge scoring for BioDesignBench Tier 2 evaluation.

Provides cross-model LLM judge panels that evaluate subjective dimensions
(approach, orchestration, feasibility, novelty, diversity) while quality
metrics remain 100% algorithmic.

This package module re-exports selected names from the ``aggregation``,
``judge``, ``panel`` and ``rubrics`` submodules so that callers can import
them directly from ``llm_judge``.

Usage:
    from llm_judge import LLMJudgePanel

    panel = LLMJudgePanel(agent_model_family="anthropic", dry_run=True)
    result = panel.evaluate_sync(
        task_description="Design a binder for IL-6R",
        tool_call_log=[...],
        designed_sequences=["MKVL..."],
        algorithmic_metrics={"pLDDT": 82.5},
    )
"""


from llm_judge.aggregation import (
    WEIGHT_SPLIT,
    aggregate_judge_scores,
    merge_algo_and_judge_scores,
    split_algo_score,
)
from llm_judge.judge import LLMJudge, parse_judge_response
from llm_judge.panel import (
    LLMJudgePanel,
    detect_agent_family,
    get_judge_models,
)
from llm_judge.rubrics import (
    JUDGE_DIMENSIONS,
    JUDGE_SYSTEM_PROMPT,
    build_judge_prompt,
)


# Names exported by ``from llm_judge import *``; every entry corresponds to
# one of the names imported above.
__all__ = [
    "LLMJudge",
    "LLMJudgePanel",
    "JUDGE_DIMENSIONS",
    "JUDGE_SYSTEM_PROMPT",
    "WEIGHT_SPLIT",
    "aggregate_judge_scores",
    "build_judge_prompt",
    "detect_agent_family",
    "get_judge_models",
    "merge_algo_and_judge_scores",
    "parse_judge_response",
    "split_algo_score",
]