/**
 * Knowledge Base registry.
 *
 * Each KB is a typed, named collection of domain knowledge the agent can
 * query during planning. Users select which KBs are active per-task. The
 * agent emits kb_lookup events with surfaced results and query timings.
 *
 * KB types:
 * - embedded: shipped with the app (static content, zero latency)
 * - api: external endpoint the agent calls (future — e.g., PubMed, ClinicalTrials)
 * - file: user-uploaded documents indexed for retrieval (future)
 * - vector: RAG-indexed vector store (future — e.g., Proteinea internal docs)
 */

/** Storage/retrieval backend of a knowledge base. */
export type KBType = "embedded" | "api" | "file" | "vector";

/** Registry metadata describing one knowledge base (not its content). */
export interface KnowledgeBase {
  id: string;
  name: string;
  description: string;
  type: KBType;
  /** Number of entries / documents in this KB. */
  entryCount: number;
  /** Whether this KB is enabled by default for new tasks. */
  defaultEnabled: boolean;
  /** Icon hint for the UI (emoji or icon name). */
  icon: string;
  /** Tags for filtering / grouping. */
  tags: string[];
}

/** A single retrievable piece of knowledge inside a KB. */
export interface KBEntry {
  id: string;
  title: string;
  content: string;
  /** Source reference (paper DOI, URL, file path). */
  source?: string;
  /** Relevance score 0-1 (for ranked retrieval). */
  score?: number;
}

/** Result of querying one KB: the KB's metadata, the matches, and the timing. */
export interface KBQueryResult {
  kb: KnowledgeBase;
  entries: KBEntry[];
  queryMs: number;
}

// ---- Embedded KB content ----

const ANTIBODY_DESIGN_KB: KBEntry[] = [
  {
    id: "abd-001",
    title: "De novo antibody design pipeline",
    content: "Standard pipeline: backbone generation (RFdiffusion) → sequence design (ProteinMPNN) → structure prediction filter (RF2/AF2, iPAE < 10) → scoring consensus (Chai-1 + AF2 + Protenix, 3-model agreement) → developability filter (Agmata aggregation < 0.3, humanness > 0.8). Typical yield: 5-15% of generated backbones pass all filters.",
    source: "Proteinea internal SOP v3.2",
  },
  {
    id: "abd-002",
    title: "Format selection guidelines",
    content: "VHH (nanobody, 15 kDa): best for tissue penetration, no Fc effector functions, ideal for imaging and intracellular targets. scFv (27 kDa): similar penetration, can be reformatted to full IgG. mAb IgG1: full Fc, enables ADCC/CDC, standard for oncology. mAb IgG4: Fc-silent, preferred for blocking/neutralization without cell killing. Bispecific: dual-target engagement, complex manufacturing.",
    source: "Proteinea format decision tree",
  },
  {
    id: "abd-003",
    title: "Affinity maturation strategies",
    content: "In silico maturation: CDR walking with ProteinMPNN (fix framework, randomize CDR positions). Typical improvement: 3-10x affinity gain per round. Library size: 1000-5000 variants per CDR. Scoring: iPAE + binding energy (Rosetta ΔΔG). Key risk: aggregation propensity increases with higher affinity — always co-optimize developability.",
    source: "Proteinea maturation protocol",
  },
  {
    id: "abd-004",
    title: "Developability assessment checklist",
    content: "Critical metrics: aggregation propensity (Agmata score < 0.3), thermostability (Tm > 65°C for VHH, > 70°C for IgG), expression yield (> 0.5 g/L CHO for IgG, > 5 mg/L E.coli for VHH), humanness score (OASis > 0.8), polyreactivity (ELISA panel negative), viscosity (< 20 cP at 150 mg/mL for SC formulation).",
    source: "Proteinea developability SOP",
  },
  {
    id: "abd-005",
    title: "Epitope mapping approaches",
    content: "Computational: docking with ClusPro/HADDOCK, interface residue prediction with Chai-1 iPAE heatmaps. Experimental: HDX-MS (gold standard), alanine scanning, peptide arrays. For therapeutic differentiation: map against known antibody epitopes (trastuzumab epitope on HER2 domain IV, cetuximab on EGFR domain III) to identify novel binding sites.",
    source: "Proteinea epitope mapping guide",
  },
];

const BENCHMARK_KB: KBEntry[] = [
  {
    id: "bkb-001",
    title: "Benchmark interpretation guide",
    content: "Hit rate is the fraction of designs that bind the target (SPR/BLI KD < 100 nM). Affinity is reported as the best single binder. Target coverage is the fraction of tested targets with at least one hit. Key caveat: benchmarks test different targets — JAM-2 covers 16 antigens including GPCRs, Chai-2 only 4 soluble. Direct comparison requires matched-target analysis.",
    source: "Proteinea benchmark methodology",
  },
  {
    id: "bkb-002",
    title: "JAM-2 benchmark deep dive",
    content: "Joint Antibody-antigen Model 2 (Dec 2025). 748 designs across 16 antigens. VHH hit rate 39%, mAb 18%. Best affinity: 170 pM (HER2 VHH). Strengths: broad target coverage, joint VH/VL prediction. Weaknesses: not yet open-source, developability pass rate only 57%. The current state-of-the-art for de novo antibody design by hit rate.",
    source: "https://arxiv.org/abs/2512.20605",
  },
  {
    id: "bkb-003",
    title: "When to use which benchmark",
    content: "For VHH-focused programs: JAM-2 (highest hit rate at 39%), RFantibody (open-source, 15%). For developability-critical programs: Chai-2 (86% pass rate vs JAM-2's 57%). For mAb programs: JAM-2 is the only benchmark with mAb-format results. For rapid iteration: DiffAb/dyMEAN have open code but lower hit rates (6-8%).",
    source: "Proteinea benchmark selection guide",
  },
];

const TARGET_BIOLOGY_KB: KBEntry[] = [
  {
    id: "tbk-001",
    title: "Target class considerations",
    content: "Soluble targets (HER2 ECD, TNF-alpha, IL-6): well-suited for all formats, crystal structures widely available. Membrane targets (PD-L1, CD20): require careful epitope selection, VHH preferred for deep cleft access. GPCRs (CXCR7, CXCR4): hardest class — limited epitopes, dynamic conformations, VHH essential for accessing extracellular loops. Hit rates for GPCRs are typically 3-5x lower than soluble targets.",
    source: "Proteinea target assessment framework",
  },
  {
    id: "tbk-002",
    title: "Competitive landscape awareness",
    content: "Before designing: check existing approved antibodies (trastuzumab/HER2, pembrolizumab/PD-1, rituximab/CD20) and their epitopes. Differentiation strategies: novel epitope, higher affinity, better developability, bispecific engagement, enhanced effector function (afucosylation for ADCC). Key databases: IMGT, Thera-SAbDab, DrugBank.",
    source: "Proteinea BD intelligence",
  },
];

const MODEL_REGISTRY_KB: KBEntry[] = [
  {
    id: "mrk-001",
    title: "Model selection decision tree",
    content: "Step 1: Choose backbone generator (RFdiffusion for de novo, ABodyBuilder2 for humanization). Step 2: Choose sequence designer (ProteinMPNN, always). Step 3: Choose scoring stack — for speed: ESMFold (CPU, seconds); for accuracy: AF2 + Chai-1 consensus (GPU, minutes); for production: 3-model consensus (AF2 + Chai-1 + Protenix). Step 4: Choose developability predictor (AbLang for humanness, Agmata for aggregation).",
    source: "Proteinea model selection SOP",
  },
  {
    id: "mrk-002",
    title: "GPU requirements and scaling",
    content: "RFdiffusion: 16GB VRAM, ~30s per backbone. ProteinMPNN: 8GB, ~5s per sequence. AF2: 24GB for full model, 16GB for AF2-multimer-reduced. Chai-1: 24GB, ~2 min per complex. For 50-design campaign: ~45 min on single A100. For 500-design campaign: recommend 4x A100 or cloud batch (RunPod, ~$2/hr per A100).",
    source: "Proteinea infrastructure guide",
  },
];

// ---- Registry ----

/** All registered knowledge bases, in display order. */
export const KNOWLEDGE_BASES: KnowledgeBase[] = [
  {
    id: "kb-antibody-design",
    name: "Antibody Design",
    description: "Design pipeline SOPs, format selection, affinity maturation, developability assessment, epitope mapping. Proteinea's internal best practices.",
    type: "embedded",
    entryCount: ANTIBODY_DESIGN_KB.length,
    defaultEnabled: true,
    icon: "🧬",
    tags: ["design", "pipeline", "developability"],
  },
  {
    id: "kb-benchmarks",
    name: "Benchmark Catalog",
    description: "Interpretation guides, deep dives on JAM-2/Chai-2/RFantibody, benchmark selection criteria for different program types.",
    type: "embedded",
    entryCount: BENCHMARK_KB.length,
    defaultEnabled: true,
    icon: "📊",
    tags: ["benchmarks", "evaluation", "comparison"],
  },
  {
    id: "kb-target-biology",
    name: "Target Biology",
    description: "Target class considerations, competitive landscape awareness, epitope differentiation strategies.",
    type: "embedded",
    entryCount: TARGET_BIOLOGY_KB.length,
    defaultEnabled: true,
    icon: "🎯",
    tags: ["targets", "biology", "competitive"],
  },
  {
    id: "kb-model-registry",
    name: "Model Registry",
    description: "Model selection decision trees, GPU requirements, scaling guidance for production campaigns.",
    type: "embedded",
    entryCount: MODEL_REGISTRY_KB.length,
    defaultEnabled: false,
    icon: "🤖",
    tags: ["models", "infrastructure", "gpu"],
  },
];

// ---- Content map for embedded retrieval ----

/** Maps a KB id to its embedded entries. */
const KB_CONTENT: Record<string, KBEntry[]> = {
  "kb-antibody-design": ANTIBODY_DESIGN_KB,
  "kb-benchmarks": BENCHMARK_KB,
  "kb-target-biology": TARGET_BIOLOGY_KB,
  "kb-model-registry": MODEL_REGISTRY_KB,
};

/**
 * Query a knowledge base by keyword. Returns matching entries ranked by
 * a simple keyword-overlap score. For embedded KBs this is instant; for
 * future api/vector KBs this would be an async call.
 *
 * The query is lower-cased and split on whitespace. An entry's score is the
 * fraction of query words that occur as substrings of its lower-cased
 * `title + " " + content`. Entries with a score of 0 are never returned.
 *
 * @param kbId - Id of a KB listed in `KNOWLEDGE_BASES`.
 * @param query - Free-text query; an empty or whitespace-only query matches nothing.
 * @param limit - Maximum number of entries to return. Negative values are
 *   treated as 0 rather than being interpreted as an offset from the end.
 * @returns The KB metadata, the top matches (copies of the entries with
 *   `score` set), and the elapsed time in whole milliseconds; `null` when
 *   the id is not registered or has no embedded content.
 */
export function queryKB(kbId: string, query: string, limit = 3): KBQueryResult | null {
  const kb = KNOWLEDGE_BASES.find((candidate) => candidate.id === kbId);
  if (!kb) return null;

  const entries = KB_CONTENT[kbId];
  if (!entries) return null;

  const startedAt = performance.now();
  const words = query.toLowerCase().split(/\s+/).filter(Boolean);

  // Array.prototype.slice reads a negative end as "from the end", which would
  // return almost everything; clamp so a negative limit yields no entries.
  const maxEntries = Math.max(0, limit);

  const ranked = entries
    .map((entry) => {
      const haystack = `${entry.title} ${entry.content}`.toLowerCase();
      const hits = words.filter((word) => haystack.includes(word)).length;
      const score = words.length > 0 ? hits / words.length : 0;
      return { ...entry, score };
    })
    .filter((entry) => entry.score > 0)
    // The sort is stable, so equally scored entries keep their KB order.
    .sort((a, b) => b.score - a.score)
    .slice(0, maxEntries);

  const queryMs = Math.round(performance.now() - startedAt);
  return { kb, entries: ranked, queryMs };
}

/**
 * Query multiple KBs and return all results. This is what the agent calls
 * during pre-plan retrieval.
 *
 * Unknown KB ids and KBs with no matching entries are omitted, so the result
 * may be shorter than `kbIds`. Results keep the order of `kbIds`.
 *
 * @param kbIds - Ids of the KBs to query.
 * @param query - Free-text query forwarded to `queryKB`.
 * @param limit - Per-KB maximum number of entries.
 */
export function queryMultipleKBs(
  kbIds: string[],
  query: string,
  limit = 3,
): KBQueryResult[] {
  return kbIds
    .map((id) => queryKB(id, query, limit))
    .filter((result): result is KBQueryResult => result !== null && result.entries.length > 0);
}

/** Returns the registry array itself (not a copy). */
export function listKnowledgeBases(): KnowledgeBase[] {
  return KNOWLEDGE_BASES;
}

/** Looks up a KB's metadata by id; returns `null` when no KB has that id. */
export function getKnowledgeBase(id: string): KnowledgeBase | null {
  return KNOWLEDGE_BASES.find((candidate) => candidate.id === id) ?? null;
}

/** Ids of the KBs flagged `defaultEnabled`, in registry order. */
export function getDefaultKBIds(): string[] {
  return KNOWLEDGE_BASES.filter((candidate) => candidate.defaultEnabled).map((candidate) => candidate.id);
}