"""Standalone 100-point scoring rubric for BioDesignBench Tier 2 design tasks. This file is a **self-contained extraction** of the scoring logic from the ``biodesignbench`` package. It has **zero external dependencies** (stdlib only) so it can run on HuggingFace Spaces without installing the full package. Modules consolidated: - biodesignbench/taxonomy.py - biodesignbench/eval/metrics/sequence.py - biodesignbench/eval/metrics/approach.py - biodesignbench/eval/metrics/orchestration.py - biodesignbench/eval/tier2/scoring.py - biodesignbench/eval/tier2/oracle.py (oracle loading stub) Six scoring components (sum = 100): approach (20 pts) — Tool/methodology selection orchestration (15 pts) — Pipeline ordering + intermediate validation quality (35 pts) — 3-tier continuous scoring (structure/interface/physics) feasibility (15 pts) — Valid AAs, length, composition + biophysical checks novelty ( 5 pts) — Sequence identity to known sequences diversity (10 pts) — Number + diversity of designs """ from __future__ import annotations import json import math import re from collections import Counter from dataclasses import dataclass, field from enum import Enum from functools import lru_cache from itertools import combinations from typing import Any, Optional # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 1 — Taxonomy (from biodesignbench/taxonomy.py) # ═══════════════════════════════════════════════════════════════════════════════ class DesignTaskType(str, Enum): """What the agent does.""" DE_NOVO_BINDER = "de_novo_binder" SEQUENCE_OPTIMIZATION = "sequence_optimization" DE_NOVO_BACKBONE = "de_novo_backbone" COMPLEX_ENGINEERING = "complex_engineering" CONFORMATIONAL_DESIGN = "conformational_design" @property def short(self) -> str: return _TASK_TYPE_SHORT[self] class BiologicalContext(str, Enum): """Domain knowledge required.""" ANTIBODY = "antibody" ENZYME = "enzyme" SIGNALING = "signaling" STRUCTURAL = "structural" FLUORESCENT = "fluorescent" THERAPEUTIC = "therapeutic" @property def short(self) -> str: return _CONTEXT_SHORT[self] _TASK_TYPE_SHORT: dict[DesignTaskType, str] = { DesignTaskType.DE_NOVO_BINDER: "dnb", DesignTaskType.SEQUENCE_OPTIMIZATION: "sqo", DesignTaskType.DE_NOVO_BACKBONE: "dnk", DesignTaskType.COMPLEX_ENGINEERING: "cpx", DesignTaskType.CONFORMATIONAL_DESIGN: "cfd", } _CONTEXT_SHORT: dict[BiologicalContext, str] = { BiologicalContext.ANTIBODY: "ab", BiologicalContext.ENZYME: "enz", BiologicalContext.SIGNALING: "sig", BiologicalContext.STRUCTURAL: "str", BiologicalContext.FLUORESCENT: "flu", BiologicalContext.THERAPEUTIC: "thr", } _SHORT_TO_TASK_TYPE: dict[str, DesignTaskType] = {v: k for k, v in _TASK_TYPE_SHORT.items()} _SHORT_TO_CONTEXT: dict[str, BiologicalContext] = {v: k for k, v in _CONTEXT_SHORT.items()} # Core tools expected per task type _CORE_TOOLS: dict[DesignTaskType, list[str]] = { DesignTaskType.DE_NOVO_BINDER: ["rfdiffusion", "proteinmpnn", "alphafold2"], DesignTaskType.SEQUENCE_OPTIMIZATION: ["proteinmpnn", "esmfold", "alphafold2"], DesignTaskType.DE_NOVO_BACKBONE: ["rfdiffusion", "proteinmpnn", "alphafold2"], DesignTaskType.COMPLEX_ENGINEERING: ["rfdiffusion", "proteinmpnn", "alphafold2"], DesignTaskType.CONFORMATIONAL_DESIGN: ["esmfold", "proteinmpnn", "alphafold2"], } _PRIMARY_METRIC: dict[DesignTaskType, str] = { DesignTaskType.DE_NOVO_BINDER: "ipTM", DesignTaskType.SEQUENCE_OPTIMIZATION: "pLDDT", DesignTaskType.DE_NOVO_BACKBONE: "pLDDT", DesignTaskType.COMPLEX_ENGINEERING: "ipTM", DesignTaskType.CONFORMATIONAL_DESIGN: "pLDDT", } @dataclass(frozen=True) class TaskCategory: """A valid cell in the DesignTaskType × BiologicalContext matrix.""" task_type: DesignTaskType context: BiologicalContext @property def category_id(self) -> str: return f"{self.task_type.short}_{self.context.short}" @property def expected_core_tools(self) -> list[str]: return list(_CORE_TOOLS[self.task_type]) @property def primary_quality_metric(self) -> str: return _PRIMARY_METRIC[self.task_type] VALID_CATEGORIES: list[TaskCategory] = [ # de_novo_binder (4) TaskCategory(DesignTaskType.DE_NOVO_BINDER, BiologicalContext.ANTIBODY), TaskCategory(DesignTaskType.DE_NOVO_BINDER, BiologicalContext.ENZYME), TaskCategory(DesignTaskType.DE_NOVO_BINDER, BiologicalContext.SIGNALING), TaskCategory(DesignTaskType.DE_NOVO_BINDER, BiologicalContext.THERAPEUTIC), # sequence_optimization (5) TaskCategory(DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.ANTIBODY), TaskCategory(DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.ENZYME), TaskCategory(DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.SIGNALING), TaskCategory(DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.STRUCTURAL), TaskCategory(DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.FLUORESCENT), # de_novo_backbone (1) TaskCategory(DesignTaskType.DE_NOVO_BACKBONE, BiologicalContext.STRUCTURAL), # complex_engineering (3) TaskCategory(DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.ENZYME), TaskCategory(DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.SIGNALING), TaskCategory(DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.STRUCTURAL), # conformational_design (4) TaskCategory(DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.ENZYME), TaskCategory(DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.SIGNALING), TaskCategory(DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.STRUCTURAL), TaskCategory(DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.FLUORESCENT), ] _CATEGORY_BY_ID: dict[str, TaskCategory] = {c.category_id: c for c in VALID_CATEGORIES} # OLD → NEW task ID mapping (30 tasks) OLD_TO_NEW_MAPPING: dict[str, str] = { "binder_001": "dnb_sig_001", "binder_003": "dnb_sig_002", "binder_005": "dnb_sig_003", "binder_007": "dnb_sig_004", "ppi_004": "dnb_sig_005", "binder_002": "dnb_thr_001", "binder_006": "dnb_thr_002", "binder_008": "dnb_thr_003", "peptide_001": "dnb_thr_004", "peptide_002": "dnb_thr_005", "peptide_003": "dnb_thr_006", "antibody_001": "sqo_ab_001", "antibody_002": "sqo_ab_002", "antibody_003": "sqo_ab_003", "antibody_004": "sqo_ab_004", "antibody_005": "sqo_ab_005", "stability_002": "sqo_enz_001", "enzyme_001": "sqo_enz_002", "enzyme_002": "sqo_enz_003", "enzyme_003": "sqo_enz_004", "stability_003": "sqo_str_001", "stability_004": "sqo_str_002", "stability_001": "sqo_flu_001", "scaffold_001": "dnk_str_001", "scaffold_002": "dnk_str_002", "scaffold_003": "dnk_str_003", "ppi_001": "cpx_str_001", "ppi_002": "cpx_str_002", "ppi_003": "cfd_sig_001", "fluorescence_001": "cfd_flu_001", } _NEW_TO_OLD_MAPPING: dict[str, str] = {v: k for k, v in OLD_TO_NEW_MAPPING.items()} _NEW_ID_RE = re.compile(r"^([a-z]{2,3})_([a-z]{2,3})_(\d{3})$") _OLD_TYPE_TO_CANONICAL: dict[str, str] = { "binder": "de_novo_binder", "antibody": "de_novo_binder", "peptide": "de_novo_binder", "stability": "sequence_optimization", "enzyme": "sequence_optimization", "fluorescence": "sequence_optimization", "scaffold": "de_novo_backbone", "ppi": "complex_engineering", } _CANONICAL_VALUES = {e.value for e in DesignTaskType} def get_category(task_id: str) -> Optional[TaskCategory]: """Get the TaskCategory for a task ID (old or new format).""" if task_id in OLD_TO_NEW_MAPPING: new_id = OLD_TO_NEW_MAPPING[task_id] cat_id = new_id.rsplit("_", 1)[0] return _CATEGORY_BY_ID.get(cat_id) m = _NEW_ID_RE.match(task_id) if m: cat_id = f"{m.group(1)}_{m.group(2)}" return _CATEGORY_BY_ID.get(cat_id) return None def get_new_task_id(old_task_id: str) -> Optional[str]: return OLD_TO_NEW_MAPPING.get(old_task_id) def get_old_task_id(new_task_id: str) -> Optional[str]: return _NEW_TO_OLD_MAPPING.get(new_task_id) def is_valid_category(task_type: DesignTaskType, context: BiologicalContext) -> bool: cat_id = f"{task_type.short}_{context.short}" return cat_id in _CATEGORY_BY_ID def parse_new_task_id( task_id: str, ) -> Optional[tuple[DesignTaskType, BiologicalContext, int]]: m = _NEW_ID_RE.match(task_id) if not m: return None task_short, ctx_short, num_str = m.group(1), m.group(2), m.group(3) task_type = _SHORT_TO_TASK_TYPE.get(task_short) context = _SHORT_TO_CONTEXT.get(ctx_short) if task_type is None or context is None: return None if not is_valid_category(task_type, context): return None return task_type, context, int(num_str) def normalize_task_type(task_type: str) -> str: lower = task_type.lower().strip() if lower in _CANONICAL_VALUES: return lower return _OLD_TYPE_TO_CANONICAL.get(lower, task_type) # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 2 — Sequence Metrics (from biodesignbench/eval/metrics/sequence.py) # ═══════════════════════════════════════════════════════════════════════════════ _KD_SCALE: dict[str, float] = { "A": 1.8, "C": 2.5, "D": -3.5, "E": -3.5, "F": 2.8, "G": -0.4, "H": -3.2, "I": 4.5, "K": -3.9, "L": 3.8, "M": 1.9, "N": -3.5, "P": -1.6, "Q": -3.5, "R": -4.5, "S": -0.8, "T": -0.7, "V": 4.2, "W": -0.9, "Y": -1.3, } STANDARD_AAS = set("ACDEFGHIKLMNPQRSTVWY") def sequence_identity(seq1: str, seq2: str) -> float: """Compute fractional sequence identity between two sequences.""" if not seq1 or not seq2: return 0.0 s1, s2 = seq1.upper(), seq2.upper() if len(s1) == len(s2): return sum(a == b for a, b in zip(s1, s2)) / len(s1) short, long = (s1, s2) if len(s1) <= len(s2) else (s2, s1) best = 0.0 for offset in range(len(long) - len(short) + 1): matches = sum(a == b for a, b in zip(short, long[offset:offset + len(short)])) identity = matches / len(short) if identity > best: best = identity return best def max_identity_to_reference(designs: list[str], reference: str) -> float: if not designs or not reference: return 0.0 return max(sequence_identity(d, reference) for d in designs) def mean_pairwise_diversity(sequences: list[str]) -> float: if len(sequences) < 2: return 0.0 total = 0.0 count = 0 for s1, s2 in combinations(sequences, 2): total += 1.0 - sequence_identity(s1, s2) count += 1 return total / count if count > 0 else 0.0 def sequence_entropy(sequences: list[str], truncate: bool = False) -> float: if len(sequences) < 2: return 0.0 lengths = {len(s) for s in sequences} if len(lengths) != 1: if not truncate: return 0.0 seq_len = min(lengths) sequences = [s[:seq_len] for s in sequences] else: seq_len = lengths.pop() if seq_len == 0: return 0.0 n = len(sequences) total_entropy = 0.0 for pos in range(seq_len): counts: dict[str, int] = {} for seq in sequences: aa = seq[pos].upper() counts[aa] = counts.get(aa, 0) + 1 pos_entropy = 0.0 for count in counts.values(): if count > 0: p = count / n pos_entropy -= p * math.log(p) total_entropy += pos_entropy / math.log(20) return total_entropy / seq_len def validate_amino_acids(sequence: str) -> dict: if not sequence or not sequence.strip(): return {"valid": False, "invalid_chars": set(), "fraction_valid": 0.0} upper = sequence.upper() chars = set(upper) invalid = chars - STANDARD_AAS valid_count = sum(1 for c in upper if c in STANDARD_AAS) return { "valid": len(invalid) == 0, "invalid_chars": invalid, "fraction_valid": valid_count / len(upper), } def check_length_constraints( sequence: str, length_range: tuple[int, int] | None, ) -> dict: length = len(sequence) if length_range is None: return {"length": length, "within_range": True, "range": None} min_len, max_len = length_range return { "length": length, "within_range": min_len <= length <= max_len, "range": length_range, } def hydrophobicity_profile(sequence: str) -> dict: if not sequence: return {"mean": 0.0, "std": 0.0, "fraction_hydrophobic": 0.0, "min": 0.0, "max": 0.0} values = [_KD_SCALE.get(aa.upper(), 0.0) for aa in sequence] n = len(values) mean = sum(values) / n variance = sum((v - mean) ** 2 for v in values) / n std = math.sqrt(variance) hydrophobic_count = sum(1 for v in values if v > 0) return { "mean": round(mean, 3), "std": round(std, 3), "fraction_hydrophobic": round(hydrophobic_count / n, 3), "min": round(min(values), 3), "max": round(max(values), 3), } def count_mutations(wt: str, designed: str) -> int: if len(wt) != len(designed): return -1 return sum(a != b for a, b in zip(wt.upper(), designed.upper())) # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 3 — Approach Scoring (from biodesignbench/eval/metrics/approach.py) # ═══════════════════════════════════════════════════════════════════════════════ class DesignFunction(str, Enum): """Functional capabilities that tools provide.""" BACKBONE_GENERATION = "backbone_generation" SEQUENCE_DESIGN = "sequence_design" STRUCTURE_PREDICTION = "structure_prediction" COMPLEX_PREDICTION = "complex_prediction" INTERFACE_ANALYSIS = "interface_analysis" STABILITY_SCORING = "stability_scoring" ENERGY_MINIMIZATION = "energy_minimization" HOTSPOT_IDENTIFICATION = "hotspot_identification" SEQUENCE_SCORING = "sequence_scoring" PHYSICS_VALIDATION = "physics_validation" TOOL_CATEGORIES: dict[str, str] = { "alphafold2": "structure_prediction", "alphafold": "structure_prediction", "af2": "structure_prediction", "esmfold": "structure_prediction", "openfold": "structure_prediction", "boltz": "structure_prediction", "colabfold": "structure_prediction", "omegafold": "structure_prediction", "rosettafold": "structure_prediction", "proteinmpnn": "sequence_design", "mpnn": "sequence_design", "esm_if": "sequence_design", "ligandmpnn": "sequence_design", "rfdiffusion": "backbone_generation", "rfdiff": "backbone_generation", "chroma": "backbone_generation", "framediff": "backbone_generation", "foldingdiff": "backbone_generation", "rosetta": "energy_optimization", "pyrosetta": "energy_optimization", "foldx": "energy_optimization", "openmm": "energy_optimization", "amber": "energy_optimization", "esm2": "energy_optimization", "foldseek": "structure_search", "dali": "structure_search", "tmalign": "structure_search", } MCP_TOOL_EXPANSION: dict[str, list[str]] = { "design_binder": ["rfdiffusion", "proteinmpnn", "esmfold"], "validate_design": ["esmfold", "alphafold2"], "optimize_sequence": ["proteinmpnn"], "predict_complex": ["alphafold2"], "analyze_interface": ["pyrosetta"], "predict_structure": ["esmfold", "alphafold2"], "score_stability": ["esm2"], "energy_minimize": ["openmm"], "suggest_hotspots": [], "get_design_status": [], "generate_backbone": ["rfdiffusion"], "rosetta_score": ["pyrosetta"], "rosetta_relax": ["pyrosetta"], "rosetta_interface_score": ["pyrosetta"], "rosetta_design": ["pyrosetta"], "predict_structure_boltz": ["boltz"], "predict_affinity_boltz": ["boltz"], } TOOL_TO_FUNCTION: dict[str, set[DesignFunction]] = { # MCP wrappers "design_binder": {DesignFunction.BACKBONE_GENERATION, DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION}, "validate_design": {DesignFunction.STRUCTURE_PREDICTION}, "optimize_sequence": {DesignFunction.SEQUENCE_DESIGN}, "predict_complex": {DesignFunction.COMPLEX_PREDICTION, DesignFunction.STRUCTURE_PREDICTION}, "analyze_interface": {DesignFunction.INTERFACE_ANALYSIS}, "predict_structure": {DesignFunction.STRUCTURE_PREDICTION}, "score_stability": {DesignFunction.STABILITY_SCORING}, "energy_minimize": {DesignFunction.ENERGY_MINIMIZATION}, "suggest_hotspots": {DesignFunction.HOTSPOT_IDENTIFICATION}, "get_design_status": set(), "generate_backbone": {DesignFunction.BACKBONE_GENERATION}, "rosetta_score": {DesignFunction.PHYSICS_VALIDATION}, "rosetta_relax": {DesignFunction.ENERGY_MINIMIZATION}, "rosetta_interface_score": {DesignFunction.INTERFACE_ANALYSIS}, "rosetta_design": {DesignFunction.SEQUENCE_DESIGN}, "predict_structure_boltz": {DesignFunction.STRUCTURE_PREDICTION}, "predict_affinity_boltz": {DesignFunction.COMPLEX_PREDICTION, DesignFunction.INTERFACE_ANALYSIS}, # Bio-level tools "rfdiffusion": {DesignFunction.BACKBONE_GENERATION}, "proteinmpnn": {DesignFunction.SEQUENCE_DESIGN}, "alphafold2": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION}, "alphafold": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION}, "esmfold": {DesignFunction.STRUCTURE_PREDICTION}, "esm2": {DesignFunction.STABILITY_SCORING, DesignFunction.SEQUENCE_SCORING}, "pyrosetta": {DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION, DesignFunction.INTERFACE_ANALYSIS}, "rosetta": {DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION, DesignFunction.INTERFACE_ANALYSIS}, "openmm": {DesignFunction.ENERGY_MINIMIZATION}, "boltz": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION}, "foldx": {DesignFunction.STABILITY_SCORING, DesignFunction.PHYSICS_VALIDATION}, "colabfold": {DesignFunction.STRUCTURE_PREDICTION, DesignFunction.COMPLEX_PREDICTION}, "foldseek": {DesignFunction.STRUCTURE_PREDICTION}, "chroma": {DesignFunction.BACKBONE_GENERATION}, "ligandmpnn": {DesignFunction.SEQUENCE_DESIGN}, "esm_if": {DesignFunction.SEQUENCE_DESIGN}, "mpnn": {DesignFunction.SEQUENCE_DESIGN}, } class _TaskTypeDict(dict): """Dict that accepts both DesignTaskType enum and string keys.""" def __init__(self, raw: dict[str, set[DesignFunction]]): super().__init__() self._raw = raw for k, v in raw.items(): super().__setitem__(k, v) def __contains__(self, key): k = key.value if hasattr(key, "value") else key return super().__contains__(k) def __getitem__(self, key): k = key.value if hasattr(key, "value") else key return super().__getitem__(k) def get(self, key, default=None): k = key.value if hasattr(key, "value") else key return super().get(k, default) REQUIRED_FUNCTIONS = _TaskTypeDict({ "de_novo_binder": {DesignFunction.BACKBONE_GENERATION, DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION}, "sequence_optimization": {DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION}, "de_novo_backbone": {DesignFunction.BACKBONE_GENERATION, DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION}, "complex_engineering": {DesignFunction.SEQUENCE_DESIGN, DesignFunction.COMPLEX_PREDICTION}, "conformational_design": {DesignFunction.SEQUENCE_DESIGN, DesignFunction.STRUCTURE_PREDICTION}, }) BONUS_FUNCTIONS = _TaskTypeDict({ "de_novo_binder": {DesignFunction.COMPLEX_PREDICTION, DesignFunction.INTERFACE_ANALYSIS, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.HOTSPOT_IDENTIFICATION}, "sequence_optimization": {DesignFunction.STABILITY_SCORING, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION}, "de_novo_backbone": {DesignFunction.ENERGY_MINIMIZATION, DesignFunction.PHYSICS_VALIDATION}, "complex_engineering": {DesignFunction.BACKBONE_GENERATION, DesignFunction.INTERFACE_ANALYSIS, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.STRUCTURE_PREDICTION}, "conformational_design": {DesignFunction.STABILITY_SCORING, DesignFunction.ENERGY_MINIMIZATION, DesignFunction.COMPLEX_PREDICTION}, }) _GENERATION_TOOLS: set[str] = { "rfdiffusion", "proteinmpnn", "design_binder", "optimize_sequence", "generate_backbone", "rosetta_design", "chroma", "ligandmpnn", "esm_if", "mpnn", } _VALIDATION_TOOLS: set[str] = { "esmfold", "alphafold2", "validate_design", "predict_structure", "predict_complex", "score_stability", "rosetta_score", "rosetta_interface_score", "predict_structure_boltz", "predict_affinity_boltz", "analyze_interface", } _REFINEMENT_TOOLS: set[str] = { "energy_minimize", "rosetta_relax", "openmm", "pyrosetta", "rosetta", } def expand_mcp_tools(tools: list[str]) -> list[str]: """Expand MCP wrapper tool names to their underlying bio tools.""" seen: set[str] = set() expanded: list[str] = [] for tool in tools: if tool in MCP_TOOL_EXPANSION: underlying = MCP_TOOL_EXPANSION[tool] if not underlying: if tool not in seen: expanded.append(tool) seen.add(tool) else: for ut in underlying: if ut not in seen: expanded.append(ut) seen.add(ut) else: if tool not in seen: expanded.append(tool) seen.add(tool) return expanded def normalize_tool_name(tool: str) -> str: return tool.lower().strip().replace(" ", "").replace("-", "").replace("_", "") def get_tool_category(tool: str) -> str | None: normalized = normalize_tool_name(tool) for name, category in TOOL_CATEGORIES.items(): if normalize_tool_name(name) == normalized: return category return None def _extract_functions_from_tools(tools: list[str]) -> set[DesignFunction]: functions: set[DesignFunction] = set() for tool in tools: if tool in TOOL_TO_FUNCTION: functions.update(TOOL_TO_FUNCTION[tool]) else: norm = normalize_tool_name(tool) for known, funcs in TOOL_TO_FUNCTION.items(): if normalize_tool_name(known) == norm: functions.update(funcs) break return functions def _check_validation(tools_used: list[str]) -> float: if not tools_used: return 0.0 has_generation = False has_validation_after_generation = False has_any_validation = False for tool in tools_used: if tool in _GENERATION_TOOLS: has_generation = True if tool in _VALIDATION_TOOLS: has_any_validation = True if has_generation: has_validation_after_generation = True if has_validation_after_generation: return 4.0 if has_any_validation: return 2.0 return 0.0 def _check_refinement(tools_used: list[str]) -> float: if not tools_used: return 0.0 for tool in tools_used: if tool in _REFINEMENT_TOOLS: return 4.0 counts = Counter(tools_used) for tool, count in counts.items(): if count >= 2 and (tool in _GENERATION_TOOLS or tool in _VALIDATION_TOOLS): return 4.0 return 0.0 def _score_approach_legacy( tools_used: list[str], tools_expected: list[str], max_points: int = 20, ) -> dict: if not tools_expected: return { "score": max_points, "max": max_points, "breakdown": [], "tools_matched": [], "tools_missing": [], "mode": "legacy", } expanded_used = expand_mcp_tools(tools_used) per_tool = max_points / len(tools_expected) used_normalized = [normalize_tool_name(t) for t in expanded_used] used_categories = [get_tool_category(t) for t in expanded_used] total = 0.0 breakdown = [] matched = [] missing = [] for expected in tools_expected: expected_norm = normalize_tool_name(expected) expected_cat = get_tool_category(expected) if expected_norm in used_normalized: total += per_tool breakdown.append({"tool": expected, "match": "exact", "points": per_tool}) matched.append(expected) elif expected_cat and expected_cat in used_categories: points = per_tool * 0.7 total += points breakdown.append({"tool": expected, "match": "category", "points": points}) matched.append(expected) else: breakdown.append({"tool": expected, "match": "none", "points": 0}) missing.append(expected) return { "score": int(round(total)), "max": max_points, "breakdown": breakdown, "tools_matched": matched, "tools_missing": missing, "mode": "legacy", } def score_approach( tools_used: list[str], tools_expected: list[str], max_points: int = 20, task_type: DesignTaskType | str | None = None, ) -> dict: """Score the agent's tool/methodology selection.""" if task_type is None: return _score_approach_legacy(tools_used, tools_expected, max_points) tt_key = task_type.value if hasattr(task_type, "value") else str(task_type) scale = max_points / 20.0 func_max = 12.0 * scale agent_functions = _extract_functions_from_tools(tools_used) required = REQUIRED_FUNCTIONS.get(tt_key, set()) bonus = BONUS_FUNCTIONS.get(tt_key, set()) if required: covered_required = agent_functions & required required_ratio = len(covered_required) / len(required) else: required_ratio = 1.0 if agent_functions else 0.0 covered_required = set() covered_bonus = agent_functions & bonus bonus_count = min(len(covered_bonus), 3) func_score = (required_ratio * 9.0 + bonus_count * 1.0) * scale func_score = min(func_score, func_max) val_score = _check_validation(tools_used) * scale ref_score = _check_refinement(tools_used) * scale total = min(func_score + val_score + ref_score, float(max_points)) return { "score": int(round(total)), "max": max_points, "mode": "function", "function_coverage": round(func_score, 1), "validation_inclusion": round(val_score, 1), "iterative_refinement": round(ref_score, 1), "required_functions": sorted(f.value for f in required), "covered_required": sorted(f.value for f in covered_required), "covered_bonus": sorted(f.value for f in covered_bonus), "agent_functions": sorted(f.value for f in agent_functions), } # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 4 — Orchestration Scoring (from biodesignbench/eval/metrics/orchestration.py) # ═══════════════════════════════════════════════════════════════════════════════ EXPECTED_PIPELINES: dict[str, list[str]] = { "de_novo_binder": ["rfdiffusion", "proteinmpnn", "esmfold"], "sequence_optimization": ["proteinmpnn", "esmfold"], "de_novo_backbone": ["rfdiffusion", "proteinmpnn", "esmfold"], "complex_engineering": ["rfdiffusion", "proteinmpnn", "esmfold"], "conformational_design": ["proteinmpnn", "esmfold"], # Old category names (backward compat) "binder": ["rfdiffusion", "proteinmpnn", "esmfold"], "antibody": ["proteinmpnn", "esmfold"], "stability": ["proteinmpnn", "esmfold"], "enzyme": ["rfdiffusion", "proteinmpnn", "esmfold"], } ORCHESTRATION_VALIDATION_TOOLS: set[str] = { "validate_design", "predict_complex", "analyze_interface", "esmfold", "score_stability", "rosetta_score", "rosetta_interface_score", "predict_structure_boltz", "predict_affinity_boltz", } def _expand_tool_name(tool: str) -> list[str]: if tool in MCP_TOOL_EXPANSION: underlying = MCP_TOOL_EXPANSION[tool] return underlying if underlying else [tool] return [tool] def _extract_ordered_bio_tools(tool_call_log: list[dict[str, Any]]) -> list[str]: utility_tools = {"execute_python", "read_file", "write_file"} ordered: list[str] = [] for entry in tool_call_log: tool = entry.get("tool", "") if tool in utility_tools: continue expanded = _expand_tool_name(tool) for t in expanded: ordered.append(normalize_tool_name(t)) return ordered def _longest_ordered_subsequence_length( actual: list[str], expected: list[str] ) -> int: if not expected or not actual: return 0 j = 0 matched = 0 for tool in actual: k = j while k < len(expected): if tool == normalize_tool_name(expected[k]): matched += 1 j = k + 1 break k += 1 return matched def _count_validation_steps(tool_call_log: list[dict[str, Any]]) -> int: count = 0 for entry in tool_call_log: tool = entry.get("tool", "") if tool in ORCHESTRATION_VALIDATION_TOOLS: count += 1 expanded = _expand_tool_name(tool) for t in expanded: if t in ORCHESTRATION_VALIDATION_TOOLS and tool not in ORCHESTRATION_VALIDATION_TOOLS: count += 1 return count def _has_adaptive_behavior(tool_call_log: list[dict[str, Any]]) -> bool: tool_args: dict[str, list[dict]] = {} for entry in tool_call_log: tool = entry.get("tool", "") args = entry.get("args_summary", {}) if tool not in tool_args: tool_args[tool] = [] tool_args[tool].append(args) for tool, args_list in tool_args.items(): if len(args_list) >= 2: for i in range(1, len(args_list)): if args_list[i] != args_list[i - 1]: return True return False def _get_task_category_for_orchestration(task_id: str) -> str | None: """Extract category from task_id using taxonomy, with legacy fallback.""" category = get_category(task_id) if category is not None: return category.task_type.value for cat in ("binder", "antibody", "stability", "enzyme"): if task_id.startswith(cat): return cat return None def score_orchestration( tool_call_log: list[dict[str, Any]], task_id: str, max_points: int = 15, ) -> dict[str, Any]: """Score the agent's multi-step pipeline orchestration.""" if not tool_call_log: return { "score": 0, "max": max_points, "pipeline_order_score": 0.0, "validation_score": 0.0, "adaptive_score": 0.0, "details": "No tool calls recorded", } category = _get_task_category_for_orchestration(task_id) expected_pipeline = EXPECTED_PIPELINES.get(category, []) ordered_tools = _extract_ordered_bio_tools(tool_call_log) if expected_pipeline: matched = _longest_ordered_subsequence_length(ordered_tools, expected_pipeline) order_ratio = matched / len(expected_pipeline) else: order_ratio = 1.0 if ordered_tools else 0.0 pipeline_points = order_ratio * max_points * 0.5 validation_count = _count_validation_steps(tool_call_log) if validation_count >= 2: validation_ratio = 1.0 elif validation_count == 1: validation_ratio = 0.6 else: validation_ratio = 0.0 validation_points = validation_ratio * max_points * 0.3 adaptive = _has_adaptive_behavior(tool_call_log) adaptive_points = max_points * 0.2 if adaptive else 0.0 total = int(round(pipeline_points + validation_points + adaptive_points)) return { "score": min(total, max_points), "max": max_points, "pipeline_order_score": round(pipeline_points, 1), "validation_score": round(validation_points, 1), "adaptive_score": round(adaptive_points, 1), "expected_pipeline": expected_pipeline, "actual_tool_order": ordered_tools, "validation_steps": validation_count, "adaptive_behavior": adaptive, } # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 5 — Quality + Scoring (from biodesignbench/eval/tier2/scoring.py) # ═══════════════════════════════════════════════════════════════════════════════ DEFAULT_DESIGN_RUBRIC = { "approach": 20, "orchestration": 15, "quality": 35, "feasibility": 15, "novelty": 5, "diversity": 10, } METRIC_RANGES: dict[str, tuple[float, float]] = { "pLDDT": (0, 100), "pTM": (0, 1), "ipTM": (0, 1), "i_pAE": (0, 50), "predicted_kd": (0, 1e6), "predicted_ddG": (-100, 100), "active_site_rmsd": (0, 50), "max_sequence_identity": (0, 1), "TM_score": (0, 1), } THRESHOLD_TO_METRIC: dict[str, tuple[str, str]] = { "pLDDT_good": ("pLDDT", "higher_is_better"), "ipTM_good": ("ipTM", "higher_is_better"), "kd_nM_good": ("predicted_kd", "lower_is_better"), "predicted_ddG_good": ("predicted_ddG", "lower_is_better"), "active_site_rmsd_good": ("active_site_rmsd", "lower_is_better"), } # Tier A: Structure Confidence _TIER_A_THRESHOLDS: dict[str, dict[str, float]] = { "pLDDT": {"pass": 65, "good": 80, "excellent": 90}, "pTM": {"pass": 0.45, "good": 0.65, "excellent": 0.80}, } # Tier B: Interface Confidence (binding only) _TIER_B_THRESHOLDS: dict[str, dict[str, float]] = { "ipTM": {"pass": 0.15, "good": 0.40, "excellent": 0.70}, "i_pAE": {"pass": 25.0, "good": 15.0, "excellent": 8.0}, } _TIER_B_DIRECTIONS: dict[str, str] = {"i_pAE": "lower_is_better"} # Tier C: Interface Physics _TIER_C_METRICS: dict[str, tuple[str, str]] = { "kd_nM_good": ("predicted_kd", "lower_is_better"), "predicted_ddG_good": ("predicted_ddG", "lower_is_better"), "active_site_rmsd_good": ("active_site_rmsd", "lower_is_better"), } _TIER_C_PHYSICS: dict[str, dict[str, float]] = { "buried_surface_area": {"pass": 800, "good": 1500, "excellent": 2500}, "hydrogen_bonds": {"pass": 5, "good": 15, "excellent": 30}, } _TIER_A_BASE = 15 _TIER_B_BASE = 10 _TIER_C_BASE = 10 _QUALITY_BASE = _TIER_A_BASE + _TIER_B_BASE + _TIER_C_BASE # 35 _BINDING_TASK_TYPES: set[DesignTaskType] = { DesignTaskType.DE_NOVO_BINDER, DesignTaskType.COMPLEX_ENGINEERING, } _BINDING_OLD_PREFIXES: set[str] = {"binder", "antibody", "ppi", "peptide"} def _is_binding_task(task_id: str | None) -> bool: if not task_id: return False cat = get_category(task_id) if cat is not None: return cat.task_type in _BINDING_TASK_TYPES prefix = task_id.split("_")[0] return prefix in _BINDING_OLD_PREFIXES def _get_tier_weights( task_id: str | None = None, max_points: int = 35, ) -> tuple[int, int, int]: if not task_id: scale = max_points / _QUALITY_BASE if _QUALITY_BASE > 0 else 0 return ( int(round(_TIER_A_BASE * scale)), int(round(_TIER_B_BASE * scale)), int(round(_TIER_C_BASE * scale)), ) is_binding = _is_binding_task(task_id) cat = get_category(task_id) if cat is None and not is_binding: scale = max_points / _QUALITY_BASE if _QUALITY_BASE > 0 else 0 return ( int(round(_TIER_A_BASE * scale)), int(round(_TIER_B_BASE * scale)), int(round(_TIER_C_BASE * scale)), ) if is_binding: ratio_a = 12 / 35 ratio_b = 18 / 35 a = int(round(max_points * ratio_a)) b = int(round(max_points * ratio_b)) c = max_points - a - b return (a, b, c) else: ratio_a = 25 / 35 ratio_b = 10 / 35 a = int(round(max_points * ratio_a)) b = int(round(max_points * ratio_b)) c = max_points - a - b return (a, b, c) def _continuous_score( value: float, thresholds: dict[str, float], direction: str = "higher_is_better", ) -> float: """Return continuous fraction [0.0, 1.0] via linear interpolation.""" p, g, e = thresholds["pass"], thresholds["good"], thresholds["excellent"] if direction == "lower_is_better": floor = p + abs(p) * 0.3 if p != 0 else 0.3 if value <= e: return 1.0 if value >= floor: return 0.0 if value <= g: span = g - e if span == 0: return 1.0 return 0.66 + (g - value) / span * 0.34 if value <= p: span = p - g if span == 0: return 0.66 return 0.33 + (p - value) / span * 0.33 span = floor - p if span == 0: return 0.0 return 0.33 * (floor - value) / span # higher_is_better floor = p * 0.7 if value >= e: return 1.0 if value <= floor: return 0.0 if value >= g: span = e - g if span == 0: return 1.0 return 0.66 + (value - g) / span * 0.34 if value >= p: span = g - p if span == 0: return 0.66 return 0.33 + (value - p) / span * 0.33 span = p - floor if span == 0: return 0.0 return 0.33 * (value - floor) / span # Category-specific quality metrics (17 valid taxonomy cells) QUALITY_METRICS: dict[tuple[DesignTaskType, BiologicalContext], dict[str, Any]] = { # de_novo_binder (4 cells) (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.ANTIBODY): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.75, "good": 0.50, "pass": 0.20}, "secondary_metrics": ["pLDDT", "predicted_kd"], }, (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.SIGNALING): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18}, "secondary_metrics": ["pLDDT", "predicted_kd"], }, (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.THERAPEUTIC): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18}, "secondary_metrics": ["pLDDT", "predicted_kd"], }, (DesignTaskType.DE_NOVO_BINDER, BiologicalContext.ENZYME): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18}, "secondary_metrics": ["pLDDT", "predicted_kd", "active_site_rmsd"], }, # sequence_optimization (5 cells) (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.ANTIBODY): { "primary_metric": "pLDDT", "thresholds": {"excellent": 90, "good": 80, "pass": 65}, "secondary_metrics": ["ipTM", "max_sequence_identity"], }, (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.ENZYME): { "primary_metric": "pLDDT", "thresholds": {"excellent": 90, "good": 80, "pass": 65}, "secondary_metrics": ["predicted_ddG", "active_site_rmsd"], }, (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.STRUCTURAL): { "primary_metric": "pLDDT", "thresholds": {"excellent": 92, "good": 82, "pass": 68}, "secondary_metrics": ["TM_score", "predicted_ddG"], }, (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.FLUORESCENT): { "primary_metric": "pLDDT", "thresholds": {"excellent": 88, "good": 78, "pass": 62}, "secondary_metrics": ["predicted_ddG", "max_sequence_identity"], }, (DesignTaskType.SEQUENCE_OPTIMIZATION, BiologicalContext.SIGNALING): { "primary_metric": "pLDDT", "thresholds": {"excellent": 90, "good": 80, "pass": 65}, "secondary_metrics": ["ipTM", "predicted_ddG"], }, # de_novo_backbone (1 cell) (DesignTaskType.DE_NOVO_BACKBONE, BiologicalContext.STRUCTURAL): { "primary_metric": "pLDDT", "thresholds": {"excellent": 88, "good": 78, "pass": 60}, "secondary_metrics": ["TM_score", "predicted_ddG"], }, # complex_engineering (3 cells) (DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.SIGNALING): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.72, "good": 0.48, "pass": 0.20}, "secondary_metrics": ["pLDDT", "predicted_kd"], }, (DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.STRUCTURAL): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.72, "good": 0.48, "pass": 0.20}, "secondary_metrics": ["pLDDT", "TM_score"], }, (DesignTaskType.COMPLEX_ENGINEERING, BiologicalContext.ENZYME): { "primary_metric": "ipTM", "thresholds": {"excellent": 0.70, "good": 0.45, "pass": 0.18}, "secondary_metrics": ["pLDDT", "predicted_kd", "active_site_rmsd"], }, # conformational_design (4 cells) (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.ENZYME): { "primary_metric": "pLDDT", "thresholds": {"excellent": 88, "good": 78, "pass": 62}, "secondary_metrics": ["predicted_ddG", "active_site_rmsd"], }, (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.SIGNALING): { "primary_metric": "pLDDT", "thresholds": {"excellent": 85, "good": 75, "pass": 60}, "secondary_metrics": ["ipTM", "predicted_kd"], }, (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.FLUORESCENT): { "primary_metric": "pLDDT", "thresholds": {"excellent": 85, "good": 75, "pass": 60}, "secondary_metrics": ["predicted_ddG", "max_sequence_identity"], }, (DesignTaskType.CONFORMATIONAL_DESIGN, BiologicalContext.STRUCTURAL): { "primary_metric": "pLDDT", "thresholds": {"excellent": 88, "good": 78, "pass": 62}, "secondary_metrics": ["TM_score", "predicted_ddG"], }, } def get_quality_config(task_id: str) -> dict[str, Any] | None: category = get_category(task_id) if category is None: return None key = (category.task_type, category.context) return QUALITY_METRICS.get(key) @dataclass class DesignScoringRubric: components: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_DESIGN_RUBRIC)) @property def max_score(self) -> int: return sum(self.components.values()) def validate(self) -> None: total = sum(self.components.values()) if total != 100: raise ValueError(f"Rubric total must be 100, got {total}") def _has_reasonable_composition(seq: str, min_length: int = 20) -> bool: upper = seq.upper() if len(upper) < min_length: return False unique_aas = len(set(upper)) if unique_aas < 5: return False counts = Counter(upper) max_fraction = max(counts.values()) / len(upper) if max_fraction > 0.5: return False ala_fraction = counts.get("A", 0) / len(upper) if ala_fraction > 0.3: return False hp = hydrophobicity_profile(upper) if hp["mean"] > 2.0: return False return True def validate_metric_range(name: str, value: float) -> bool: if name not in METRIC_RANGES: return True low, high = METRIC_RANGES[name] return low <= value <= high # Functional Similarity thresholds for non-binding Tier B _FUNCTIONAL_SIM_DEFAULTS: dict[DesignTaskType, dict[str, float]] = { DesignTaskType.SEQUENCE_OPTIMIZATION: {"pass": 0.40, "good": 0.60, "excellent": 0.85}, DesignTaskType.CONFORMATIONAL_DESIGN: {"pass": 0.15, "good": 0.30, "excellent": 0.50}, DesignTaskType.DE_NOVO_BACKBONE: {"pass": 0.10, "good": 0.20, "excellent": 0.40}, } def _derive_functional_sim_thresholds(value: float) -> dict[str, float]: return { "pass": value * 0.5, "good": value, "excellent": min(value * 2, 1.0), } def _get_functional_sim_thresholds( thresholds: dict[str, float], task_id: str, ) -> dict[str, float] | None: if _is_binding_task(task_id): return None gt_value = thresholds.get("max_seq_identity_good") if gt_value is not None: return _derive_functional_sim_thresholds(gt_value) cat = get_category(task_id) if cat is None: return None return _FUNCTIONAL_SIM_DEFAULTS.get(cat.task_type) def _score_functional_similarity( designs: list[str], oracle_sequences: list[str], thresholds: dict[str, float], ) -> float | None: if not designs or not oracle_sequences: return None best_identity = 0.0 for design in designs: for oracle in oracle_sequences: ident = sequence_identity(design, oracle) if ident > best_identity: best_identity = ident return _continuous_score(best_identity, thresholds, "higher_is_better") def score_quality( agent_metrics: dict[str, float], thresholds: dict[str, float], max_points: int = 35, task_id: str | None = None, designs: list[str] | None = None, oracle_sequences: list[str] | None = None, ) -> dict[str, Any]: """Score quality using 3-tier continuous system.""" valid_metrics = { k: v for k, v in agent_metrics.items() if validate_metric_range(k, v) } for extra_key in ("buried_surface_area", "hydrogen_bonds"): if extra_key in agent_metrics and extra_key not in valid_metrics: val = agent_metrics[extra_key] if isinstance(val, (int, float)) and val >= 0: valid_metrics[extra_key] = float(val) tier_a_max, tier_b_max, tier_c_max = _get_tier_weights(task_id, max_points) is_binding = _is_binding_task(task_id) overrides: dict[str, dict[str, float]] = {} if task_id: config = get_quality_config(task_id) if config and "thresholds" in config: primary = config["primary_metric"] overrides[primary] = config["thresholds"] # Tier A: Structure Confidence tier_a_scores: dict[str, float] = {} for metric, default_thresh in _TIER_A_THRESHOLDS.items(): if metric in valid_metrics: thresh = overrides.get(metric, default_thresh) tier_a_scores[metric] = _continuous_score( valid_metrics[metric], thresh, "higher_is_better" ) tier_a_pts = (sum(tier_a_scores.values()) / len(tier_a_scores)) * tier_a_max if tier_a_scores else 0.0 # Tier B: Interface or Functional Similarity tier_b_scores: dict[str, float] = {} tier_b_pts = 0.0 _use_functional_sim = ( tier_b_max > 0 and task_id is not None and not is_binding and get_category(task_id) is not None ) if tier_b_max > 0: if _use_functional_sim: if designs and oracle_sequences: func_thresh = _get_functional_sim_thresholds(thresholds, task_id) if func_thresh is not None: frac = _score_functional_similarity(designs, oracle_sequences, func_thresh) if frac is not None: tier_b_pts = frac * tier_b_max tier_b_scores["oracle_identity"] = frac else: for metric, default_thresh in _TIER_B_THRESHOLDS.items(): if metric in valid_metrics: thresh = overrides.get(metric, default_thresh) direction = _TIER_B_DIRECTIONS.get(metric, "higher_is_better") tier_b_scores[metric] = _continuous_score( valid_metrics[metric], thresh, direction ) if tier_b_scores: tier_b_pts = (sum(tier_b_scores.values()) / len(tier_b_scores)) * tier_b_max # Tier C: Interface Physics tier_c_fractions: list[float] = [] tier_c_breakdown: list[dict] = [] if tier_c_max > 0: if is_binding: for metric_key, phys_thresh in _TIER_C_PHYSICS.items(): if metric_key in valid_metrics: frac = _continuous_score(valid_metrics[metric_key], phys_thresh, "higher_is_better") tier_c_fractions.append(frac) tier_c_breakdown.append({ "threshold": metric_key, "metric": metric_key, "value": valid_metrics[metric_key], "threshold_value": phys_thresh, "fraction": round(frac, 3), }) for thresh_key, (metric_key, direction) in _TIER_C_METRICS.items(): if thresh_key in thresholds and metric_key in valid_metrics: threshold_val = thresholds[thresh_key] agent_val = valid_metrics[metric_key] margin = abs(threshold_val) * 0.5 if threshold_val != 0 else 1.0 if direction == "lower_is_better": gt_thresh = { "pass": threshold_val + margin, "good": threshold_val, "excellent": threshold_val - margin, } else: gt_thresh = { "pass": threshold_val - margin, "good": threshold_val, "excellent": threshold_val + margin, } frac = _continuous_score(agent_val, gt_thresh, direction) tier_c_fractions.append(frac) tier_c_breakdown.append({ "threshold": thresh_key, "metric": metric_key, "value": agent_val, "threshold_value": threshold_val, "fraction": round(frac, 3), }) tier_c_pts = (sum(tier_c_fractions) / len(tier_c_fractions)) * tier_c_max if tier_c_fractions else 0.0 total = min(tier_a_pts + tier_b_pts + tier_c_pts, max_points) metrics_evaluated = len(tier_a_scores) + len(tier_b_scores) + len(tier_c_fractions) return { "score": int(round(total)), "max": max_points, "tier_a": round(tier_a_pts, 1), "tier_b": round(tier_b_pts, 1), "tier_c": round(tier_c_pts, 1), "metrics_evaluated": metrics_evaluated, "breakdown": { "structure": tier_a_scores, "interface": tier_b_scores, "physics": tier_c_breakdown, }, } def score_novelty( designs: list[str], reference_seq: str | None, thresholds: dict[str, float], max_points: int = 5, ) -> dict[str, Any]: """Score novelty by computing sequence identity to reference.""" if not designs: return {"score": 0, "max": max_points, "max_identity": 0.0, "identity_threshold": None} identity_threshold = thresholds.get("max_seq_identity_good") max_id = max_identity_to_reference(designs, reference_seq) if reference_seq else 0.0 if identity_threshold is None: if reference_seq: novelty_ratio = 1.0 - max_id score = int(round(max_points * min(novelty_ratio * 2, 1.0))) else: score = max_points elif identity_threshold >= 0.9: if max_id >= identity_threshold: score = max_points elif max_id >= identity_threshold * 0.9: score = int(round(max_points * 0.7)) else: score = int(round(max_points * 0.3)) else: if max_id <= identity_threshold: score = max_points elif max_id <= identity_threshold * 1.5: score = int(round(max_points * 0.5)) else: score = int(round(max_points * 0.2)) return { "score": min(score, max_points), "max": max_points, "max_identity": round(max_id, 3), "identity_threshold": identity_threshold, } def score_diversity( designs: list[str], max_designs: int = 10, max_points: int = 5, ) -> dict[str, Any]: """Score diversity of designs.""" if not designs: return {"score": 0, "max": max_points, "num_designs": 0, "pairwise_diversity": 0.0, "entropy": 0.0} num = len(designs) count_fraction = min(num / max_designs, 1.0) if max_designs > 0 else 1.0 diversity = mean_pairwise_diversity(designs) entropy = sequence_entropy(designs) count_score = count_fraction * max_points * 0.4 diversity_score = diversity * max_points * 0.4 entropy_score = entropy * max_points * 0.2 total = int(round(count_score + diversity_score + entropy_score)) return { "score": min(total, max_points), "max": max_points, "num_designs": num, "pairwise_diversity": round(diversity, 3), "entropy": round(entropy, 3), } def score_feasibility( designs: list[str], constraints: dict[str, Any], max_points: int = 25, ) -> dict[str, Any]: """Score feasibility of designed sequences.""" if not designs: return {"score": 0, "max": max_points, "aa_validity": 0.0, "length_validity": 0.0, "composition_check": 0.0} per_check = max_points / 3 length_range = constraints.get("length_range") if isinstance(length_range, list): length_range = tuple(length_range) comp_min_length = 20 if length_range and length_range[1] < 20: comp_min_length = max(length_range[0], 5) aa_valid_count = sum(1 for seq in designs if validate_amino_acids(seq)["valid"]) aa_fraction = aa_valid_count / len(designs) length_valid_count = sum(1 for seq in designs if check_length_constraints(seq, length_range)["within_range"]) length_fraction = length_valid_count / len(designs) composition_ok = sum(1 for seq in designs if _has_reasonable_composition(seq, min_length=comp_min_length)) composition_fraction = composition_ok / len(designs) aa_score = aa_fraction * per_check length_score = length_fraction * per_check comp_score = composition_fraction * per_check total = int(round(aa_score + length_score + comp_score)) return { "score": min(total, max_points), "max": max_points, "aa_validity": round(aa_fraction, 3), "length_validity": round(length_fraction, 3), "composition_check": round(composition_fraction, 3), } # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 6 — Design Gate + Final Score # ═══════════════════════════════════════════════════════════════════════════════ _DESIGN_GATE_ZEROED = {"quality", "novelty", "diversity", "feasibility"} _DESIGN_GATE_CAP = 30 def apply_design_gate( component_scores: dict[str, int], num_designs: int, ) -> dict[str, int]: """If no designs produced, cap total at 30.""" if num_designs >= 1: return dict(component_scores) gated = dict(component_scores) for key in _DESIGN_GATE_ZEROED: gated[key] = 0 remaining_sum = sum(v for k, v in gated.items() if k not in _DESIGN_GATE_ZEROED) if remaining_sum > _DESIGN_GATE_CAP: scale = _DESIGN_GATE_CAP / remaining_sum for key in gated: if key not in _DESIGN_GATE_ZEROED: gated[key] = int(round(gated[key] * scale)) return gated def calculate_design_score( rubric: DesignScoringRubric, results: dict[str, int], ) -> dict[str, Any]: """Calculate final design task score from component results.""" breakdown = {} for component, max_pts in rubric.components.items(): actual = min(results.get(component, 0), max_pts) breakdown[component] = {"score": actual, "max": max_pts} total = sum(v["score"] for v in breakdown.values()) max_possible = rubric.max_score return { "breakdown": breakdown, "total": total, "max_possible": max_possible, "percentage": round(total / max_possible * 100, 1) if max_possible > 0 else 0, } # ═══════════════════════════════════════════════════════════════════════════════ # SECTION 7 — Full Task Scorer (high-level API for eval pipeline) # ═══════════════════════════════════════════════════════════════════════════════ def score_submission_task( task_id: str, sequences: list[str], run_log: list[dict[str, Any]], ground_truth: dict[str, Any], agent_metrics: dict[str, float] | None = None, oracle_sequences: list[str] | None = None, ) -> dict[str, Any]: """Score a single task submission end-to-end. This is the main entry point for the evaluation pipeline. Args: task_id: Task identifier (e.g., "dnb_sig_001"). sequences: Designed amino acid sequences from the agent. run_log: Tool call log from the agent. ground_truth: Ground truth dict with thresholds, reference_sequence, design_constraints, tools_expected, max_designs. agent_metrics: Optional metrics reported by the agent or from Boltz (e.g., {"pLDDT": 85.0, "ipTM": 0.35}). oracle_sequences: Optional oracle sequences for functional similarity. Returns: Dict with: total_score, component_scores, details, num_designs. """ if agent_metrics is None: agent_metrics = {} # Extract fields from ground truth thresholds = ground_truth.get("thresholds", {}) reference_seq = ground_truth.get("reference_sequence") constraints = ground_truth.get("design_constraints", {}) tools_expected = ground_truth.get("tools_expected", []) max_designs = ground_truth.get("max_designs", 10) # Get task category for function-based scoring cat = get_category(task_id) task_type = cat.task_type if cat else None # Extract tools used from run_log tools_used = [entry.get("tool", "") for entry in run_log if entry.get("tool")] # Score all 6 components approach_result = score_approach( tools_used=tools_used, tools_expected=tools_expected, task_type=task_type, ) orchestration_result = score_orchestration( tool_call_log=run_log, task_id=task_id, ) quality_result = score_quality( agent_metrics=agent_metrics, thresholds=thresholds, task_id=task_id, designs=sequences, oracle_sequences=oracle_sequences, ) feasibility_result = score_feasibility( designs=sequences, constraints=constraints, ) novelty_result = score_novelty( designs=sequences, reference_seq=reference_seq, thresholds=thresholds, ) diversity_result = score_diversity( designs=sequences, max_designs=max_designs, ) # Build component scores dict component_scores = { "approach": approach_result["score"], "orchestration": orchestration_result["score"], "quality": quality_result["score"], "feasibility": feasibility_result["score"], "novelty": novelty_result["score"], "diversity": diversity_result["score"], } # Apply design gate num_designs = len(sequences) gated = apply_design_gate(component_scores, num_designs) total = sum(gated.values()) return { "total_score": total, "component_scores": gated, "num_designs": num_designs, "details": { "approach": approach_result, "orchestration": orchestration_result, "quality": quality_result, "feasibility": feasibility_result, "novelty": novelty_result, "diversity": diversity_result, }, } def aggregate_scores( per_task_scores: dict[str, dict[str, Any]], ) -> dict[str, Any]: """Aggregate per-task scores into an overall submission result. If `eval_judge.run_judge_panel()` has been run beforehand each task will carry `hybrid_scores` and `hybrid_total`; in that case we use the hybrid (algo + LLM judge, capped at rubric max) as the canonical score. Otherwise we fall back to the algo-only `component_scores` / `total_score` produced by the dispatcher + Boltz pipeline. """ if not per_task_scores: return { "overall_score": 0.0, "component_scores": {c: 0.0 for c in DEFAULT_DESIGN_RUBRIC}, "taxonomy_scores": {}, "tasks_completed": 0, "tasks_total": 0, "tasks_with_zero": 0, } totals = {c: 0.0 for c in DEFAULT_DESIGN_RUBRIC} n = len(per_task_scores) tasks_with_zero = 0 used_hybrid = False # Taxonomy breakdown taxonomy_scores: dict[str, dict[str, list[float]]] = {} for task_id, result in per_task_scores.items(): if "hybrid_scores" in result and "hybrid_total" in result: comp_scores = result["hybrid_scores"] total_score = result["hybrid_total"] used_hybrid = True else: comp_scores = result.get("component_scores", {}) total_score = result.get("total_score", 0.0) if total_score == 0: tasks_with_zero += 1 for comp, val in comp_scores.items(): totals[comp] += val # Taxonomy mapping cat = get_category(task_id) if cat: tt = cat.task_type.value ctx = cat.context.short taxonomy_scores.setdefault(tt, {}).setdefault(ctx, []).append(total_score) # Average components avg_components = {c: round(v / n, 1) for c, v in totals.items()} overall = round(sum(avg_components.values()), 1) # Average taxonomy scores taxonomy_avg: dict[str, dict[str, float]] = {} for tt, contexts in taxonomy_scores.items(): taxonomy_avg[tt] = {} for ctx, scores in contexts.items(): taxonomy_avg[tt][ctx] = round(sum(scores) / len(scores), 1) return { "overall_score": overall, "component_scores": avg_components, "taxonomy_scores": taxonomy_avg, "tasks_completed": n, "tasks_total": n, "tasks_with_zero": tasks_with_zero, "scoring_mode": "hybrid" if used_hybrid else "algo", }