proteinea / src /lib /knowledge-bases.ts
Mahmoud Eljendy
feat: Antibody Studio — AI-native antibody design workspace by Proteinea
30cc31a
/**
* Knowledge Base registry. Each KB is a typed, named collection of domain
* knowledge the agent can query during planning. Users select which KBs are
* active per-task. The agent emits kb_lookup events with surfaced results
* and query timings.
*
* KB types:
* - embedded: shipped with the app (static content, zero latency)
* - api: external endpoint the agent calls (future — e.g., PubMed, ClinicalTrials)
* - file: user-uploaded documents indexed for retrieval (future)
* - vector: RAG-indexed vector store (future — e.g., Proteinea internal docs)
*/
/** Storage/retrieval kind of a knowledge base; one of the four KB types listed in the module header comment. */
export type KBType = "embedded" | "api" | "file" | "vector";
/**
 * Registry metadata describing one knowledge base. This record holds no
 * content itself; embedded content is looked up separately by `id`.
 */
export interface KnowledgeBase {
/** Identifier used to look the KB up (matched by `queryKB` and `getKnowledgeBase`). */
id: string;
/** Short display name. */
name: string;
/** Longer summary of what the KB covers. */
description: string;
/** How the KB's content is stored and retrieved. */
type: KBType;
/** Number of entries / documents in this KB. */
entryCount: number;
/** Whether this KB is enabled by default for new tasks. */
defaultEnabled: boolean;
/** Icon hint for the UI (emoji or icon name). */
icon: string;
/** Tags for filtering / grouping. */
tags: string[];
}
/** A single piece of knowledge stored in a KB and returned from queries. */
export interface KBEntry {
/** Identifier of the entry. */
id: string;
/** Short heading; searched together with `content` by `queryKB`. */
title: string;
/** Body text of the entry; searched together with `title` by `queryKB`. */
content: string;
/** Source reference (paper DOI, URL, file path). */
source?: string;
/** Relevance score 0-1 (for ranked retrieval). Set by `queryKB` on the entries it returns. */
score?: number;
}
/** Outcome of querying one knowledge base. */
export interface KBQueryResult {
/** The knowledge base that was queried. */
kb: KnowledgeBase;
/** Matching entries, highest score first. */
entries: KBEntry[];
/** Time spent on the lookup, in milliseconds (rounded to an integer by `queryKB`). */
queryMs: number;
}
// ---- Embedded KB content ----
/**
 * Embedded entries for the "kb-antibody-design" knowledge base (wired up in
 * KB_CONTENT). Entry ids use the "abd-" prefix.
 */
const ANTIBODY_DESIGN_KB: KBEntry[] = [
{
id: "abd-001",
title: "De novo antibody design pipeline",
content: "Standard pipeline: backbone generation (RFdiffusion) → sequence design (ProteinMPNN) → structure prediction filter (RF2/AF2, iPAE < 10) → scoring consensus (Chai-1 + AF2 + Protenix, 3-model agreement) → developability filter (Agmata aggregation < 0.3, humanness > 0.8). Typical yield: 5-15% of generated backbones pass all filters.",
source: "Proteinea internal SOP v3.2",
},
{
id: "abd-002",
title: "Format selection guidelines",
content: "VHH (nanobody, 15 kDa): best for tissue penetration, no Fc effector functions, ideal for imaging and intracellular targets. scFv (27 kDa): similar penetration, can be reformatted to full IgG. mAb IgG1: full Fc, enables ADCC/CDC, standard for oncology. mAb IgG4: Fc-silent, preferred for blocking/neutralization without cell killing. Bispecific: dual-target engagement, complex manufacturing.",
source: "Proteinea format decision tree",
},
{
id: "abd-003",
title: "Affinity maturation strategies",
content: "In silico maturation: CDR walking with ProteinMPNN (fix framework, randomize CDR positions). Typical improvement: 3-10x affinity gain per round. Library size: 1000-5000 variants per CDR. Scoring: iPAE + binding energy (Rosetta ΔΔG). Key risk: aggregation propensity increases with higher affinity — always co-optimize developability.",
source: "Proteinea maturation protocol",
},
{
id: "abd-004",
title: "Developability assessment checklist",
content: "Critical metrics: aggregation propensity (Agmata score < 0.3), thermostability (Tm > 65°C for VHH, > 70°C for IgG), expression yield (> 0.5 g/L CHO for IgG, > 5 mg/L E.coli for VHH), humanness score (OASis > 0.8), polyreactivity (ELISA panel negative), viscosity (< 20 cP at 150 mg/mL for SC formulation).",
source: "Proteinea developability SOP",
},
{
id: "abd-005",
title: "Epitope mapping approaches",
content: "Computational: docking with ClusPro/HADDOCK, interface residue prediction with Chai-1 iPAE heatmaps. Experimental: HDX-MS (gold standard), alanine scanning, peptide arrays. For therapeutic differentiation: map against known antibody epitopes (trastuzumab epitope on HER2 domain IV, cetuximab on EGFR domain III) to identify novel binding sites.",
source: "Proteinea epitope mapping guide",
},
];
/**
 * Embedded entries for the "kb-benchmarks" knowledge base (wired up in
 * KB_CONTENT). Entry ids use the "bkb-" prefix.
 */
const BENCHMARK_KB: KBEntry[] = [
{
id: "bkb-001",
title: "Benchmark interpretation guide",
content: "Hit rate is the fraction of designs that bind the target (SPR/BLI KD < 100 nM). Affinity is reported as the best single binder. Target coverage is the fraction of tested targets with at least one hit. Key caveat: benchmarks test different targets — JAM-2 covers 16 antigens including GPCRs, Chai-2 only 4 soluble. Direct comparison requires matched-target analysis.",
source: "Proteinea benchmark methodology",
},
{
id: "bkb-002",
title: "JAM-2 benchmark deep dive",
content: "Joint Antibody-antigen Model 2 (Dec 2025). 748 designs across 16 antigens. VHH hit rate 39%, mAb 18%. Best affinity: 170 pM (HER2 VHH). Strengths: broad target coverage, joint VH/VL prediction. Weaknesses: not yet open-source, developability pass rate only 57%. The current state-of-the-art for de novo antibody design by hit rate.",
source: "https://arxiv.org/abs/2512.20605",
},
{
id: "bkb-003",
title: "When to use which benchmark",
content: "For VHH-focused programs: JAM-2 (highest hit rate at 39%), RFantibody (open-source, 15%). For developability-critical programs: Chai-2 (86% pass rate vs JAM-2's 57%). For mAb programs: JAM-2 is the only benchmark with mAb-format results. For rapid iteration: DiffAb/dyMEAN have open code but lower hit rates (6-8%).",
source: "Proteinea benchmark selection guide",
},
];
/**
 * Embedded entries for the "kb-target-biology" knowledge base (wired up in
 * KB_CONTENT). Entry ids use the "tbk-" prefix.
 */
const TARGET_BIOLOGY_KB: KBEntry[] = [
{
id: "tbk-001",
title: "Target class considerations",
content: "Soluble targets (HER2 ECD, TNF-alpha, IL-6): well-suited for all formats, crystal structures widely available. Membrane targets (PD-L1, CD20): require careful epitope selection, VHH preferred for deep cleft access. GPCRs (CXCR7, CXCR4): hardest class — limited epitopes, dynamic conformations, VHH essential for accessing extracellular loops. Hit rates for GPCRs are typically 3-5x lower than soluble targets.",
source: "Proteinea target assessment framework",
},
{
id: "tbk-002",
title: "Competitive landscape awareness",
content: "Before designing: check existing approved antibodies (trastuzumab/HER2, pembrolizumab/PD-1, rituximab/CD20) and their epitopes. Differentiation strategies: novel epitope, higher affinity, better developability, bispecific engagement, enhanced effector function (afucosylation for ADCC). Key databases: IMGT, Thera-SAbDab, DrugBank.",
source: "Proteinea BD intelligence",
},
];
/**
 * Embedded entries for the "kb-model-registry" knowledge base (wired up in
 * KB_CONTENT). Entry ids use the "mrk-" prefix.
 */
const MODEL_REGISTRY_KB: KBEntry[] = [
{
id: "mrk-001",
title: "Model selection decision tree",
content: "Step 1: Choose backbone generator (RFdiffusion for de novo, ABodyBuilder2 for humanization). Step 2: Choose sequence designer (ProteinMPNN, always). Step 3: Choose scoring stack — for speed: ESMFold (CPU, seconds); for accuracy: AF2 + Chai-1 consensus (GPU, minutes); for production: 3-model consensus (AF2 + Chai-1 + Protenix). Step 4: Choose developability predictor (AbLang for humanness, Agmata for aggregation).",
source: "Proteinea model selection SOP",
},
{
id: "mrk-002",
title: "GPU requirements and scaling",
content: "RFdiffusion: 16GB VRAM, ~30s per backbone. ProteinMPNN: 8GB, ~5s per sequence. AF2: 24GB for full model, 16GB for AF2-multimer-reduced. Chai-1: 24GB, ~2 min per complex. For 50-design campaign: ~45 min on single A100. For 500-design campaign: recommend 4x A100 or cloud batch (RunPod, ~$2/hr per A100).",
source: "Proteinea infrastructure guide",
},
];
// ---- Registry ----
/**
 * All knowledge bases known to the app, in display order as declared here.
 * Every KB listed is of type "embedded"; each `entryCount` is taken from the
 * length of the corresponding content array rather than hard-coded.
 * "kb-model-registry" is the only KB not enabled by default.
 */
export const KNOWLEDGE_BASES: KnowledgeBase[] = [
{
id: "kb-antibody-design",
name: "Antibody Design",
description: "Design pipeline SOPs, format selection, affinity maturation, developability assessment, epitope mapping. Proteinea's internal best practices.",
type: "embedded",
entryCount: ANTIBODY_DESIGN_KB.length,
defaultEnabled: true,
icon: "🧬",
tags: ["design", "pipeline", "developability"],
},
{
id: "kb-benchmarks",
name: "Benchmark Catalog",
description: "Interpretation guides, deep dives on JAM-2/Chai-2/RFantibody, benchmark selection criteria for different program types.",
type: "embedded",
entryCount: BENCHMARK_KB.length,
defaultEnabled: true,
icon: "📊",
tags: ["benchmarks", "evaluation", "comparison"],
},
{
id: "kb-target-biology",
name: "Target Biology",
description: "Target class considerations, competitive landscape awareness, epitope differentiation strategies.",
type: "embedded",
entryCount: TARGET_BIOLOGY_KB.length,
defaultEnabled: true,
icon: "🎯",
tags: ["targets", "biology", "competitive"],
},
{
id: "kb-model-registry",
name: "Model Registry",
description: "Model selection decision trees, GPU requirements, scaling guidance for production campaigns.",
type: "embedded",
entryCount: MODEL_REGISTRY_KB.length,
defaultEnabled: false,
icon: "🤖",
tags: ["models", "infrastructure", "gpu"],
},
];
// ---- Content map for embedded retrieval ----
/**
 * Maps a knowledge-base id to its embedded entries. `queryKB` returns null
 * for an id that has no key here, so every searchable KB in KNOWLEDGE_BASES
 * needs a matching key.
 */
const KB_CONTENT: Record<string, KBEntry[]> = {
"kb-antibody-design": ANTIBODY_DESIGN_KB,
"kb-benchmarks": BENCHMARK_KB,
"kb-target-biology": TARGET_BIOLOGY_KB,
"kb-model-registry": MODEL_REGISTRY_KB,
};
/**
 * Query a single knowledge base by keyword.
 *
 * The query is lower-cased and split on whitespace into distinct keywords.
 * Each entry is scored by the fraction of those keywords that occur as a
 * substring of its lower-cased `title` + `content`. Entries matching no
 * keyword are dropped; the rest are returned best-first, with ties keeping
 * their declaration order. For embedded KBs this is synchronous; future
 * api/vector KBs would need an async call instead.
 *
 * @param kbId - Id of a knowledge base registered in `KNOWLEDGE_BASES`.
 * @param query - Free-text query; an empty or whitespace-only query matches nothing.
 * @param limit - Maximum number of entries to return. Zero or a negative
 *   value yields no entries.
 * @returns The KB, its top matching entries (copies carrying a `score`), and
 *   the elapsed time in whole milliseconds; `null` when `kbId` is not
 *   registered or has no content in `KB_CONTENT`.
 */
export function queryKB(kbId: string, query: string, limit = 3): KBQueryResult | null {
  const kb = KNOWLEDGE_BASES.find((k) => k.id === kbId);
  if (!kb) return null;
  const entries = KB_CONTENT[kbId];
  if (!entries) return null;

  const t0 = performance.now();

  // Distinct keywords only: a repeated word must not count twice toward the
  // overlap score.
  const keywords = Array.from(
    new Set(query.toLowerCase().split(/\s+/).filter(Boolean)),
  );

  // `slice` interprets a negative end as an offset from the end of the array,
  // which would return all-but-the-last-N entries. Clamp so that a negative
  // limit means "no results".
  const maxResults = Math.max(0, limit);

  const ranked = entries
    .map((entry) => {
      const haystack = `${entry.title} ${entry.content}`.toLowerCase();
      const hits = keywords.filter((word) => haystack.includes(word)).length;
      const score = keywords.length > 0 ? hits / keywords.length : 0;
      // Spread into a new object so the shared embedded entry is never mutated.
      return { ...entry, score };
    })
    .filter((entry) => entry.score > 0)
    .sort((a, b) => b.score - a.score);

  const top = ranked.slice(0, maxResults);
  const queryMs = Math.round(performance.now() - t0);
  return { kb, entries: top, queryMs };
}
/**
 * Run the same query against several knowledge bases. The agent uses this
 * for pre-plan retrieval.
 *
 * @param kbIds - Ids of the knowledge bases to search, in the order results
 *   should be reported.
 * @param query - Free-text query forwarded to `queryKB`.
 * @param limit - Per-KB cap on returned entries, forwarded to `queryKB`.
 * @returns One result per KB that produced at least one entry; unknown ids
 *   and KBs without matches are omitted.
 */
export function queryMultipleKBs(
  kbIds: string[],
  query: string,
  limit = 3,
): KBQueryResult[] {
  const collected: KBQueryResult[] = [];
  for (const kbId of kbIds) {
    const outcome = queryKB(kbId, query, limit);
    // Skip ids that could not be queried as well as empty result sets.
    if (outcome === null || outcome.entries.length === 0) {
      continue;
    }
    collected.push(outcome);
  }
  return collected;
}
/**
 * List every registered knowledge base.
 *
 * @returns The `KNOWLEDGE_BASES` registry array itself (not a copy).
 */
export function listKnowledgeBases(): KnowledgeBase[] {
  const registry = KNOWLEDGE_BASES;
  return registry;
}
/**
 * Look up a knowledge base by id.
 *
 * @param id - Id to match against the registry.
 * @returns The first registered KB whose `id` equals `id`, or `null` when
 *   none does.
 */
export function getKnowledgeBase(id: string): KnowledgeBase | null {
  for (const candidate of KNOWLEDGE_BASES) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return null;
}
/**
 * Collect the ids of the knowledge bases that are enabled by default.
 *
 * @returns Ids of all registry entries with `defaultEnabled` set, in
 *   registry order.
 */
export function getDefaultKBIds(): string[] {
  const defaultIds: string[] = [];
  for (const { id, defaultEnabled } of KNOWLEDGE_BASES) {
    if (defaultEnabled) {
      defaultIds.push(id);
    }
  }
  return defaultIds;
}