File size: 10,870 Bytes
30cc31a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | /**
* Knowledge Base registry. Each KB is a typed, named collection of domain
* knowledge the agent can query during planning. Users select which KBs are
* active per-task. The agent emits kb_lookup events with surfaced results
* and query timings.
*
* KB types:
* - embedded: shipped with the app (static content, zero latency)
* - api: external endpoint the agent calls (future — e.g., PubMed, ClinicalTrials)
* - file: user-uploaded documents indexed for retrieval (future)
* - vector: RAG-indexed vector store (future — e.g., Proteinea internal docs)
*/
/** Storage/retrieval backend of a knowledge base. */
export type KBType =
  | "embedded"
  | "api"
  | "file"
  | "vector";
/**
 * Registry metadata for a single knowledge base. This describes the KB
 * itself (for listing and selection); the entries are stored separately.
 */
export interface KnowledgeBase {
  /** Identifier the registry and the query functions match on. */
  id: string;
  /** Short display name. */
  name: string;
  /** Longer summary of what the KB covers. */
  description: string;
  /** Backend kind of this KB. */
  type: KBType;
  /** How many entries / documents the KB holds. */
  entryCount: number;
  /** True when the KB should be switched on by default for new tasks. */
  defaultEnabled: boolean;
  /** UI icon hint: an emoji or an icon name. */
  icon: string;
  /** Free-form tags used for filtering / grouping. */
  tags: Array<string>;
}
/** One retrievable item of knowledge inside a knowledge base. */
export interface KBEntry {
  /** Identifier of the entry. */
  id: string;
  /** Headline of the entry. */
  title: string;
  /** Body text of the entry. */
  content: string;
  /** Optional provenance: paper DOI, URL, or file path. */
  source?: string;
  /** Optional relevance score in the range 0-1, used for ranked retrieval. */
  score?: number;
}
/** Outcome of querying one knowledge base. */
export interface KBQueryResult {
  /** The knowledge base that was queried. */
  kb: KnowledgeBase;
  /** Entries surfaced by the query. */
  entries: Array<KBEntry>;
  /** Time the query took, in milliseconds. */
  queryMs: number;
}
// ---- Embedded KB content ----
/**
 * Embedded entries on antibody design: pipeline, format selection,
 * affinity maturation, developability and epitope mapping.
 *
 * The text is surfaced verbatim to the agent/UI, so non-ASCII symbols
 * (arrows, em dashes, delta, degree sign) must be stored as real Unicode.
 */
const ANTIBODY_DESIGN_KB: KBEntry[] = [
  {
    id: "abd-001",
    title: "De novo antibody design pipeline",
    content: "Standard pipeline: backbone generation (RFdiffusion) → sequence design (ProteinMPNN) → structure prediction filter (RF2/AF2, iPAE < 10) → scoring consensus (Chai-1 + AF2 + Protenix, 3-model agreement) → developability filter (Agmata aggregation < 0.3, humanness > 0.8). Typical yield: 5-15% of generated backbones pass all filters.",
    source: "Proteinea internal SOP v3.2",
  },
  {
    id: "abd-002",
    title: "Format selection guidelines",
    content: "VHH (nanobody, 15 kDa): best for tissue penetration, no Fc effector functions, ideal for imaging and intracellular targets. scFv (27 kDa): similar penetration, can be reformatted to full IgG. mAb IgG1: full Fc, enables ADCC/CDC, standard for oncology. mAb IgG4: Fc-silent, preferred for blocking/neutralization without cell killing. Bispecific: dual-target engagement, complex manufacturing.",
    source: "Proteinea format decision tree",
  },
  {
    id: "abd-003",
    title: "Affinity maturation strategies",
    content: "In silico maturation: CDR walking with ProteinMPNN (fix framework, randomize CDR positions). Typical improvement: 3-10x affinity gain per round. Library size: 1000-5000 variants per CDR. Scoring: iPAE + binding energy (Rosetta ΔΔG). Key risk: aggregation propensity increases with higher affinity — always co-optimize developability.",
    source: "Proteinea maturation protocol",
  },
  {
    id: "abd-004",
    title: "Developability assessment checklist",
    content: "Critical metrics: aggregation propensity (Agmata score < 0.3), thermostability (Tm > 65°C for VHH, > 70°C for IgG), expression yield (> 0.5 g/L CHO for IgG, > 5 mg/L E.coli for VHH), humanness score (OASis > 0.8), polyreactivity (ELISA panel negative), viscosity (< 20 cP at 150 mg/mL for SC formulation).",
    source: "Proteinea developability SOP",
  },
  {
    id: "abd-005",
    title: "Epitope mapping approaches",
    content: "Computational: docking with ClusPro/HADDOCK, interface residue prediction with Chai-1 iPAE heatmaps. Experimental: HDX-MS (gold standard), alanine scanning, peptide arrays. For therapeutic differentiation: map against known antibody epitopes (trastuzumab epitope on HER2 domain IV, cetuximab on EGFR domain III) to identify novel binding sites.",
    source: "Proteinea epitope mapping guide",
  },
];
/**
 * Embedded entries on benchmark interpretation and selection.
 *
 * The text is surfaced verbatim to the agent/UI, so punctuation such as
 * the em dash must be stored as real Unicode.
 */
const BENCHMARK_KB: KBEntry[] = [
  {
    id: "bkb-001",
    title: "Benchmark interpretation guide",
    content: "Hit rate is the fraction of designs that bind the target (SPR/BLI KD < 100 nM). Affinity is reported as the best single binder. Target coverage is the fraction of tested targets with at least one hit. Key caveat: benchmarks test different targets — JAM-2 covers 16 antigens including GPCRs, Chai-2 only 4 soluble. Direct comparison requires matched-target analysis.",
    source: "Proteinea benchmark methodology",
  },
  {
    id: "bkb-002",
    title: "JAM-2 benchmark deep dive",
    content: "Joint Antibody-antigen Model 2 (Dec 2025). 748 designs across 16 antigens. VHH hit rate 39%, mAb 18%. Best affinity: 170 pM (HER2 VHH). Strengths: broad target coverage, joint VH/VL prediction. Weaknesses: not yet open-source, developability pass rate only 57%. The current state-of-the-art for de novo antibody design by hit rate.",
    source: "https://arxiv.org/abs/2512.20605",
  },
  {
    id: "bkb-003",
    title: "When to use which benchmark",
    content: "For VHH-focused programs: JAM-2 (highest hit rate at 39%), RFantibody (open-source, 15%). For developability-critical programs: Chai-2 (86% pass rate vs JAM-2's 57%). For mAb programs: JAM-2 is the only benchmark with mAb-format results. For rapid iteration: DiffAb/dyMEAN have open code but lower hit rates (6-8%).",
    source: "Proteinea benchmark selection guide",
  },
];
/**
 * Embedded entries on target classes and the competitive landscape.
 *
 * The text is surfaced verbatim to the agent/UI, so punctuation such as
 * the em dash must be stored as real Unicode.
 */
const TARGET_BIOLOGY_KB: KBEntry[] = [
  {
    id: "tbk-001",
    title: "Target class considerations",
    content: "Soluble targets (HER2 ECD, TNF-alpha, IL-6): well-suited for all formats, crystal structures widely available. Membrane targets (PD-L1, CD20): require careful epitope selection, VHH preferred for deep cleft access. GPCRs (CXCR7, CXCR4): hardest class — limited epitopes, dynamic conformations, VHH essential for accessing extracellular loops. Hit rates for GPCRs are typically 3-5x lower than soluble targets.",
    source: "Proteinea target assessment framework",
  },
  {
    id: "tbk-002",
    title: "Competitive landscape awareness",
    content: "Before designing: check existing approved antibodies (trastuzumab/HER2, pembrolizumab/PD-1, rituximab/CD20) and their epitopes. Differentiation strategies: novel epitope, higher affinity, better developability, bispecific engagement, enhanced effector function (afucosylation for ADCC). Key databases: IMGT, Thera-SAbDab, DrugBank.",
    source: "Proteinea BD intelligence",
  },
];
/**
 * Embedded entries on model selection and GPU sizing.
 *
 * The text is surfaced verbatim to the agent/UI, so punctuation such as
 * the em dash must be stored as real Unicode.
 */
const MODEL_REGISTRY_KB: KBEntry[] = [
  {
    id: "mrk-001",
    title: "Model selection decision tree",
    content: "Step 1: Choose backbone generator (RFdiffusion for de novo, ABodyBuilder2 for humanization). Step 2: Choose sequence designer (ProteinMPNN, always). Step 3: Choose scoring stack — for speed: ESMFold (CPU, seconds); for accuracy: AF2 + Chai-1 consensus (GPU, minutes); for production: 3-model consensus (AF2 + Chai-1 + Protenix). Step 4: Choose developability predictor (AbLang for humanness, Agmata for aggregation).",
    source: "Proteinea model selection SOP",
  },
  {
    id: "mrk-002",
    title: "GPU requirements and scaling",
    content: "RFdiffusion: 16GB VRAM, ~30s per backbone. ProteinMPNN: 8GB, ~5s per sequence. AF2: 24GB for full model, 16GB for AF2-multimer-reduced. Chai-1: 24GB, ~2 min per complex. For 50-design campaign: ~45 min on single A100. For 500-design campaign: recommend 4x A100 or cloud batch (RunPod, ~$2/hr per A100).",
    source: "Proteinea infrastructure guide",
  },
];
// ---- Registry ----
/**
 * Registry of every knowledge base the app knows about, in display order.
 *
 * `entryCount` is read from the backing entry arrays rather than written
 * by hand, so it cannot drift from the embedded content. Icons are emoji
 * and must be stored as real Unicode code points.
 */
export const KNOWLEDGE_BASES: KnowledgeBase[] = [
  {
    id: "kb-antibody-design",
    name: "Antibody Design",
    description: "Design pipeline SOPs, format selection, affinity maturation, developability assessment, epitope mapping. Proteinea's internal best practices.",
    type: "embedded",
    entryCount: ANTIBODY_DESIGN_KB.length,
    defaultEnabled: true,
    icon: "🧬",
    tags: ["design", "pipeline", "developability"],
  },
  {
    id: "kb-benchmarks",
    name: "Benchmark Catalog",
    description: "Interpretation guides, deep dives on JAM-2/Chai-2/RFantibody, benchmark selection criteria for different program types.",
    type: "embedded",
    entryCount: BENCHMARK_KB.length,
    defaultEnabled: true,
    // NOTE(review): the previous glyph was garbled beyond recovery; a bar
    // chart emoji was chosen as the closest fit — confirm the intended icon.
    icon: "📊",
    tags: ["benchmarks", "evaluation", "comparison"],
  },
  {
    id: "kb-target-biology",
    name: "Target Biology",
    description: "Target class considerations, competitive landscape awareness, epitope differentiation strategies.",
    type: "embedded",
    entryCount: TARGET_BIOLOGY_KB.length,
    defaultEnabled: true,
    icon: "🎯",
    tags: ["targets", "biology", "competitive"],
  },
  {
    id: "kb-model-registry",
    name: "Model Registry",
    description: "Model selection decision trees, GPU requirements, scaling guidance for production campaigns.",
    type: "embedded",
    entryCount: MODEL_REGISTRY_KB.length,
    defaultEnabled: false,
    icon: "🤖",
    tags: ["models", "infrastructure", "gpu"],
  },
];
// ---- Content map for embedded retrieval ----
/**
 * Lookup table from knowledge-base id to the entry array that backs it.
 * An id with no row here has no embedded content to search.
 */
const KB_CONTENT: { [kbId: string]: KBEntry[] } = {
  "kb-antibody-design": ANTIBODY_DESIGN_KB,
  "kb-benchmarks": BENCHMARK_KB,
  "kb-target-biology": TARGET_BIOLOGY_KB,
  "kb-model-registry": MODEL_REGISTRY_KB,
};
/**
 * Query a single knowledge base by keyword.
 *
 * The query is lower-cased and split on whitespace. Each entry is scored
 * as the fraction of query terms that occur as a substring of its
 * lower-cased title + content. Entries with a score of 0 are dropped, the
 * rest are returned best-first, capped at `limit`. For embedded KBs this
 * is instant; for future api/vector KBs this would be an async call.
 *
 * @param kbId - Id of the knowledge base to search.
 * @param query - Free-text query; an empty or whitespace-only query matches nothing.
 * @param limit - Maximum number of entries to return. Values below 0 are
 *   treated as 0.
 * @returns The KB, the matching entries (each carrying its `score`) and the
 *   elapsed time in whole milliseconds, or `null` when `kbId` is not
 *   registered or has no content.
 */
export function queryKB(kbId: string, query: string, limit = 3): KBQueryResult | null {
  const kb = KNOWLEDGE_BASES.find((k) => k.id === kbId);
  if (!kb) return null;

  const entries = KB_CONTENT[kbId];
  if (!entries) return null;

  // Array.prototype.slice treats a negative end index as an offset from the
  // end of the array, so an unclamped negative limit would return almost
  // every match instead of none.
  const maxResults = Math.max(0, limit);

  const t0 = performance.now();
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);

  const top = entries
    .map((entry) => {
      const haystack = `${entry.title} ${entry.content}`.toLowerCase();
      const hits = terms.filter((term) => haystack.includes(term)).length;
      const score = terms.length > 0 ? hits / terms.length : 0;
      // Spread into a new object so the stored entry is never mutated.
      return { ...entry, score };
    })
    // Dropping non-matches before sorting yields the same result as
    // sorting first, because zero scores always rank last.
    .filter((entry) => entry.score > 0)
    .sort((a, b) => b.score - a.score)
    .slice(0, maxResults);

  const queryMs = Math.round(performance.now() - t0);
  return { kb, entries: top, queryMs };
}
/**
 * Run the same query against several knowledge bases, in the order given.
 * Used by the agent during pre-plan retrieval.
 *
 * Ids that do not resolve to a KB, and KBs that surface no entries, are
 * left out of the returned list.
 */
export function queryMultipleKBs(
  kbIds: string[],
  query: string,
  limit = 3,
): KBQueryResult[] {
  const results: KBQueryResult[] = [];
  for (const kbId of kbIds) {
    const result = queryKB(kbId, query, limit);
    if (result === null || result.entries.length === 0) continue;
    results.push(result);
  }
  return results;
}
/**
 * Return the full knowledge-base registry.
 *
 * The registry array itself is returned, not a copy.
 */
export function listKnowledgeBases(): KnowledgeBase[] {
  const registry = KNOWLEDGE_BASES;
  return registry;
}
/**
 * Look up a knowledge base by id.
 *
 * @returns The first registry item whose `id` matches, or `null` if none does.
 */
export function getKnowledgeBase(id: string): KnowledgeBase | null {
  for (const candidate of KNOWLEDGE_BASES) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return null;
}
/**
 * Collect the ids of every knowledge base flagged `defaultEnabled`,
 * in registry order.
 */
export function getDefaultKBIds(): string[] {
  const defaultIds: string[] = [];
  for (const kb of KNOWLEDGE_BASES) {
    if (kb.defaultEnabled) {
      defaultIds.push(kb.id);
    }
  }
  return defaultIds;
}
|