Spaces:

meljendy
/

proteinea

Sleeping

proteinea / src /lib /knowledge-bases.ts

Mahmoud Eljendy

feat: Antibody Studio — AI-native antibody design workspace by Proteinea

30cc31a about 1 month ago

10.9 kB

	/**
	* Knowledge Base registry. Each KB is a typed, named collection of domain
	* knowledge the agent can query during planning. Users select which KBs are
	* active per-task. The agent emits kb_lookup events with surfaced results
	* and query timings.
	*
	* KB types:
	* - embedded: shipped with the app (static content, zero latency)
	* - api: external endpoint the agent calls (future — e.g., PubMed, ClinicalTrials)
	* - file: user-uploaded documents indexed for retrieval (future)
	* - vector: RAG-indexed vector store (future — e.g., Proteinea internal docs)
	*/

	/**
	 * Kind of backing store behind a knowledge base.
	 *
	 * - "embedded": content shipped with the app (static).
	 * - "api": external endpoint the agent calls (planned).
	 * - "file": user-uploaded documents indexed for retrieval (planned).
	 * - "vector": RAG-indexed vector store (planned).
	 */
	export type KBType = "embedded" | "api" | "file" | "vector";

	/**
	 * Registry metadata describing one knowledge base. This is the descriptor
	 * only; the entries themselves are stored separately and keyed by `id`.
	 */
	export interface KnowledgeBase {
	/** Identifier used to look the KB up (e.g. "kb-benchmarks"). */
	id: string;
	/** Short human-readable name. */
	name: string;
	/** Longer summary of what the KB covers. */
	description: string;
	/** Backing-store kind; see KBType. */
	type: KBType;
	/** Number of entries / documents in this KB. */
	entryCount: number;
	/** Whether this KB is enabled by default for new tasks. */
	defaultEnabled: boolean;
	/** Icon hint for the UI (emoji or icon name). */
	icon: string;
	/** Tags for filtering / grouping. */
	tags: string[];
	}

	/** A single piece of knowledge stored in (or retrieved from) a knowledge base. */
	export interface KBEntry {
	/** Entry identifier (e.g. "abd-001"). */
	id: string;
	/** Short headline; searched together with `content` by queryKB. */
	title: string;
	/** Body text of the entry. */
	content: string;
	/** Source reference (paper DOI, URL, file path). */
	source?: string;
	/**
	 * Relevance score 0-1 (for ranked retrieval). Absent on stored entries;
	 * queryKB fills it in on the copies it returns.
	 */
	score?: number;
	}

	/** Outcome of querying one knowledge base. */
	export interface KBQueryResult {
	/** Registry metadata of the KB that was queried. */
	kb: KnowledgeBase;
	/** Matching entries, best match first. */
	entries: KBEntry[];
	/** Time spent scoring and ranking, in milliseconds (rounded to an integer by queryKB). */
	queryMs: number;
	}

	// ---- Embedded KB content ----

	/**
	 * Embedded entries for the "kb-antibody-design" knowledge base (wired up in
	 * KB_CONTENT). The `content` strings are reference text: code in this file
	 * only substring-matches against them and returns them verbatim.
	 */
	const ANTIBODY_DESIGN_KB: KBEntry[] = [
	{
	id: "abd-001",
	title: "De novo antibody design pipeline",
	content: "Standard pipeline: backbone generation (RFdiffusion) → sequence design (ProteinMPNN) → structure prediction filter (RF2/AF2, iPAE < 10) → scoring consensus (Chai-1 + AF2 + Protenix, 3-model agreement) → developability filter (Agmata aggregation < 0.3, humanness > 0.8). Typical yield: 5-15% of generated backbones pass all filters.",
	source: "Proteinea internal SOP v3.2",
	},
	{
	id: "abd-002",
	title: "Format selection guidelines",
	content: "VHH (nanobody, 15 kDa): best for tissue penetration, no Fc effector functions, ideal for imaging and intracellular targets. scFv (27 kDa): similar penetration, can be reformatted to full IgG. mAb IgG1: full Fc, enables ADCC/CDC, standard for oncology. mAb IgG4: Fc-silent, preferred for blocking/neutralization without cell killing. Bispecific: dual-target engagement, complex manufacturing.",
	source: "Proteinea format decision tree",
	},
	{
	id: "abd-003",
	title: "Affinity maturation strategies",
	content: "In silico maturation: CDR walking with ProteinMPNN (fix framework, randomize CDR positions). Typical improvement: 3-10x affinity gain per round. Library size: 1000-5000 variants per CDR. Scoring: iPAE + binding energy (Rosetta ΔΔG). Key risk: aggregation propensity increases with higher affinity — always co-optimize developability.",
	source: "Proteinea maturation protocol",
	},
	{
	id: "abd-004",
	title: "Developability assessment checklist",
	content: "Critical metrics: aggregation propensity (Agmata score < 0.3), thermostability (Tm > 65°C for VHH, > 70°C for IgG), expression yield (> 0.5 g/L CHO for IgG, > 5 mg/L E.coli for VHH), humanness score (OASis > 0.8), polyreactivity (ELISA panel negative), viscosity (< 20 cP at 150 mg/mL for SC formulation).",
	source: "Proteinea developability SOP",
	},
	{
	id: "abd-005",
	title: "Epitope mapping approaches",
	content: "Computational: docking with ClusPro/HADDOCK, interface residue prediction with Chai-1 iPAE heatmaps. Experimental: HDX-MS (gold standard), alanine scanning, peptide arrays. For therapeutic differentiation: map against known antibody epitopes (trastuzumab epitope on HER2 domain IV, cetuximab on EGFR domain III) to identify novel binding sites.",
	source: "Proteinea epitope mapping guide",
	},
	];

	/**
	 * Embedded entries for the "kb-benchmarks" knowledge base (wired up in
	 * KB_CONTENT). The `content` strings are reference text: code in this file
	 * only substring-matches against them and returns them verbatim.
	 */
	const BENCHMARK_KB: KBEntry[] = [
	{
	id: "bkb-001",
	title: "Benchmark interpretation guide",
	content: "Hit rate is the fraction of designs that bind the target (SPR/BLI KD < 100 nM). Affinity is reported as the best single binder. Target coverage is the fraction of tested targets with at least one hit. Key caveat: benchmarks test different targets — JAM-2 covers 16 antigens including GPCRs, Chai-2 only 4 soluble. Direct comparison requires matched-target analysis.",
	source: "Proteinea benchmark methodology",
	},
	{
	id: "bkb-002",
	title: "JAM-2 benchmark deep dive",
	content: "Joint Antibody-antigen Model 2 (Dec 2025). 748 designs across 16 antigens. VHH hit rate 39%, mAb 18%. Best affinity: 170 pM (HER2 VHH). Strengths: broad target coverage, joint VH/VL prediction. Weaknesses: not yet open-source, developability pass rate only 57%. The current state-of-the-art for de novo antibody design by hit rate.",
	source: "https://arxiv.org/abs/2512.20605",
	},
	{
	id: "bkb-003",
	title: "When to use which benchmark",
	content: "For VHH-focused programs: JAM-2 (highest hit rate at 39%), RFantibody (open-source, 15%). For developability-critical programs: Chai-2 (86% pass rate vs JAM-2's 57%). For mAb programs: JAM-2 is the only benchmark with mAb-format results. For rapid iteration: DiffAb/dyMEAN have open code but lower hit rates (6-8%).",
	source: "Proteinea benchmark selection guide",
	},
	];

	/**
	 * Embedded entries for the "kb-target-biology" knowledge base (wired up in
	 * KB_CONTENT). The `content` strings are reference text: code in this file
	 * only substring-matches against them and returns them verbatim.
	 */
	const TARGET_BIOLOGY_KB: KBEntry[] = [
	{
	id: "tbk-001",
	title: "Target class considerations",
	content: "Soluble targets (HER2 ECD, TNF-alpha, IL-6): well-suited for all formats, crystal structures widely available. Membrane targets (PD-L1, CD20): require careful epitope selection, VHH preferred for deep cleft access. GPCRs (CXCR7, CXCR4): hardest class — limited epitopes, dynamic conformations, VHH essential for accessing extracellular loops. Hit rates for GPCRs are typically 3-5x lower than soluble targets.",
	source: "Proteinea target assessment framework",
	},
	{
	id: "tbk-002",
	title: "Competitive landscape awareness",
	content: "Before designing: check existing approved antibodies (trastuzumab/HER2, pembrolizumab/PD-1, rituximab/CD20) and their epitopes. Differentiation strategies: novel epitope, higher affinity, better developability, bispecific engagement, enhanced effector function (afucosylation for ADCC). Key databases: IMGT, Thera-SAbDab, DrugBank.",
	source: "Proteinea BD intelligence",
	},
	];

	/**
	 * Embedded entries for the "kb-model-registry" knowledge base (wired up in
	 * KB_CONTENT). The `content` strings are reference text: code in this file
	 * only substring-matches against them and returns them verbatim.
	 */
	const MODEL_REGISTRY_KB: KBEntry[] = [
	{
	id: "mrk-001",
	title: "Model selection decision tree",
	content: "Step 1: Choose backbone generator (RFdiffusion for de novo, ABodyBuilder2 for humanization). Step 2: Choose sequence designer (ProteinMPNN, always). Step 3: Choose scoring stack — for speed: ESMFold (CPU, seconds); for accuracy: AF2 + Chai-1 consensus (GPU, minutes); for production: 3-model consensus (AF2 + Chai-1 + Protenix). Step 4: Choose developability predictor (AbLang for humanness, Agmata for aggregation).",
	source: "Proteinea model selection SOP",
	},
	{
	id: "mrk-002",
	title: "GPU requirements and scaling",
	content: "RFdiffusion: 16GB VRAM, ~30s per backbone. ProteinMPNN: 8GB, ~5s per sequence. AF2: 24GB for full model, 16GB for AF2-multimer-reduced. Chai-1: 24GB, ~2 min per complex. For 50-design campaign: ~45 min on single A100. For 500-design campaign: recommend 4x A100 or cloud batch (RunPod, ~$2/hr per A100).",
	source: "Proteinea infrastructure guide",
	},
	];

	// ---- Registry ----

	/**
	 * Registry of every knowledge base known to the app. Each descriptor's `id`
	 * must also be a key of KB_CONTENT for queryKB to find its entries, and
	 * `entryCount` is derived from the matching embedded array's length so it
	 * cannot drift from the actual content.
	 */
	export const KNOWLEDGE_BASES: KnowledgeBase[] = [
	{
	id: "kb-antibody-design",
	name: "Antibody Design",
	description: "Design pipeline SOPs, format selection, affinity maturation, developability assessment, epitope mapping. Proteinea's internal best practices.",
	type: "embedded",
	entryCount: ANTIBODY_DESIGN_KB.length,
	defaultEnabled: true,
	icon: "🧬",
	tags: ["design", "pipeline", "developability"],
	},
	{
	id: "kb-benchmarks",
	name: "Benchmark Catalog",
	description: "Interpretation guides, deep dives on JAM-2/Chai-2/RFantibody, benchmark selection criteria for different program types.",
	type: "embedded",
	entryCount: BENCHMARK_KB.length,
	defaultEnabled: true,
	icon: "📊",
	tags: ["benchmarks", "evaluation", "comparison"],
	},
	{
	id: "kb-target-biology",
	name: "Target Biology",
	description: "Target class considerations, competitive landscape awareness, epitope differentiation strategies.",
	type: "embedded",
	entryCount: TARGET_BIOLOGY_KB.length,
	defaultEnabled: true,
	icon: "🎯",
	tags: ["targets", "biology", "competitive"],
	},
	{
	// The only KB that is opt-in rather than enabled by default.
	id: "kb-model-registry",
	name: "Model Registry",
	description: "Model selection decision trees, GPU requirements, scaling guidance for production campaigns.",
	type: "embedded",
	entryCount: MODEL_REGISTRY_KB.length,
	defaultEnabled: false,
	icon: "🤖",
	tags: ["models", "infrastructure", "gpu"],
	},
	];

	// ---- Content map for embedded retrieval ----

	/**
	 * Maps a knowledge base id to its embedded entries. queryKB reads from this
	 * table; an id that is registered in KNOWLEDGE_BASES but missing here makes
	 * queryKB return null.
	 */
	const KB_CONTENT: Record<string, KBEntry[]> = {
	"kb-antibody-design": ANTIBODY_DESIGN_KB,
	"kb-benchmarks": BENCHMARK_KB,
	"kb-target-biology": TARGET_BIOLOGY_KB,
	"kb-model-registry": MODEL_REGISTRY_KB,
	};

	/**
	 * Query a single knowledge base by keyword.
	 *
	 * The query is lower-cased and split on whitespace into distinct terms. Each
	 * entry is scored by the fraction of those terms that occur as substrings of
	 * its lower-cased "title content" text. Entries are ranked by descending
	 * score (ties keep their stored order) and entries scoring zero are dropped.
	 * For embedded KBs this is synchronous; future api/vector KBs would need an
	 * async variant.
	 *
	 * @param kbId  Id of a knowledge base registered in KNOWLEDGE_BASES.
	 * @param query Free-text query; matching is case-insensitive and by substring,
	 *              so a term like "ab" also matches "antibody".
	 * @param limit Maximum number of entries to return (default 3). Values that
	 *              are negative or NaN yield no entries.
	 * @returns The KB descriptor, the matched entries (copies carrying `score`)
	 *          and the elapsed time in whole milliseconds; null when the id is
	 *          not registered or has no embedded content.
	 */
	export function queryKB(kbId: string, query: string, limit = 3): KBQueryResult | null {
		const kb = KNOWLEDGE_BASES.find((k) => k.id === kbId);
		if (!kb) return null;

		const entries = KB_CONTENT[kbId];
		if (!entries) return null;

		const t0 = performance.now();

		// De-duplicate terms so a repeated word cannot weigh more than once in
		// the overlap ratio.
		const terms = Array.from(new Set(query.toLowerCase().split(/\s+/).filter(Boolean)));

		// slice() treats a negative end as "count from the end", which would
		// return all-but-the-last-N entries; clamp so a bad limit means "none".
		const maxEntries = limit > 0 ? limit : 0;

		const scored = entries.map((entry) => {
			const text = `${entry.title} ${entry.content}`.toLowerCase();
			const hits = terms.filter((term) => text.includes(term)).length;
			const score = terms.length > 0 ? hits / terms.length : 0;
			// Spread into a copy so the stored entry is never mutated.
			return { ...entry, score };
		});

		const top = scored
			.filter((entry) => entry.score > 0)
			.sort((a, b) => b.score - a.score)
			.slice(0, maxEntries);
		const queryMs = Math.round(performance.now() - t0);

		return { kb, entries: top, queryMs };
	}

	/**
	 * Run one keyword query against several knowledge bases and collect the
	 * results that actually matched something. Unknown ids and KBs with no
	 * matching entries are omitted; the remaining results keep the order of
	 * `kbIds`. This is the entry point the agent uses for pre-plan retrieval.
	 *
	 * @param kbIds Ids of the knowledge bases to search.
	 * @param query Free-text query forwarded unchanged to queryKB.
	 * @param limit Per-KB cap on returned entries (default 3).
	 * @returns One result per KB that produced at least one entry.
	 */
	export function queryMultipleKBs(
		kbIds: string[],
		query: string,
		limit = 3,
	): KBQueryResult[] {
		const collected: KBQueryResult[] = [];
		for (const kbId of kbIds) {
			const result = queryKB(kbId, query, limit);
			// Skip KBs that do not exist or surfaced nothing for this query.
			if (result === null || result.entries.length === 0) continue;
			collected.push(result);
		}
		return collected;
	}

	/**
	 * List every registered knowledge base.
	 *
	 * @returns The KNOWLEDGE_BASES registry array itself (the same reference on
	 *          every call, not a copy).
	 */
	export function listKnowledgeBases(): KnowledgeBase[] {
		const registry = KNOWLEDGE_BASES;
		return registry;
	}

	/**
	 * Look up a knowledge base's registry descriptor by id.
	 *
	 * @param id Knowledge base identifier, e.g. "kb-benchmarks".
	 * @returns The matching descriptor from KNOWLEDGE_BASES, or null when no
	 *          knowledge base has that id.
	 */
	export function getKnowledgeBase(id: string): KnowledgeBase | null {
		return KNOWLEDGE_BASES.find((k) => k.id === id) ?? null;
	}

	/**
	 * Collect the ids of the knowledge bases that are switched on by default.
	 *
	 * @returns Ids of all registry entries whose `defaultEnabled` flag is set,
	 *          in registry order.
	 */
	export function getDefaultKBIds(): string[] {
		const defaultIds: string[] = [];
		for (const kb of KNOWLEDGE_BASES) {
			if (kb.defaultEnabled) {
				defaultIds.push(kb.id);
			}
		}
		return defaultIds;
	}