File size: 10,870 Bytes
30cc31a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/**
 * Knowledge Base registry. Each KB is a typed, named collection of domain
 * knowledge the agent can query during planning. Users select which KBs are
 * active per-task. The agent emits kb_lookup events with surfaced results
 * and query timings.
 *
 * KB types:
 * - embedded: shipped with the app (static content, zero latency)
 * - api:      external endpoint the agent calls (future — e.g., PubMed, ClinicalTrials)
 * - file:     user-uploaded documents indexed for retrieval (future)
 * - vector:   RAG-indexed vector store (future — e.g., Proteinea internal docs)
 */

/**
 * Storage/retrieval kind of a knowledge base.
 *
 * - `"embedded"`: content shipped with the app.
 * - `"api"`:      external endpoint (planned, not implemented in this module).
 * - `"file"`:     user-uploaded documents (planned, not implemented in this module).
 * - `"vector"`:   vector store (planned, not implemented in this module).
 */
export type KBType =
  | "embedded"
  | "api"
  | "file"
  | "vector";

/** Registry metadata describing one knowledge base (not its entries). */
export interface KnowledgeBase {
  /** Identifier used to look the KB up (e.g. by `queryKB`). */
  id: string;
  /** Short display name. */
  name: string;
  /** Longer human-readable summary of what the KB covers. */
  description: string;
  /** Storage/retrieval kind of this KB. */
  type: KBType;
  /** How many entries / documents the KB holds. */
  entryCount: number;
  /** True when new tasks should start with this KB switched on. */
  defaultEnabled: boolean;
  /** UI icon hint: an emoji or an icon name. */
  icon: string;
  /** Free-form labels used for filtering / grouping. */
  tags: Array<string>;
}

/** A single piece of knowledge stored in (or retrieved from) a KB. */
export interface KBEntry {
  /** Entry identifier. */
  id: string;
  /** Short headline for the entry. */
  title: string;
  /** Full text body of the entry. */
  content: string;
  /** Optional provenance: paper DOI, URL, or file path. */
  source?: string;
  /** Optional relevance in the range 0-1, set when results are ranked. */
  score?: number;
}

/** Outcome of querying one knowledge base. */
export interface KBQueryResult {
  /** Metadata of the KB that was queried. */
  kb: KnowledgeBase;
  /** Entries surfaced by the query. */
  entries: Array<KBEntry>;
  /** Time the query took, in milliseconds. */
  queryMs: number;
}

// ---- Embedded KB content ----

/**
 * Embedded antibody-design entries: design pipeline, format selection,
 * affinity maturation, developability checklist, and epitope mapping.
 *
 * Text is stored as proper Unicode (arrows, em dash, degree sign) so it
 * renders correctly wherever the entry content is displayed.
 */
const ANTIBODY_DESIGN_KB: KBEntry[] = [
  {
    id: "abd-001",
    title: "De novo antibody design pipeline",
    content: "Standard pipeline: backbone generation (RFdiffusion) → sequence design (ProteinMPNN) → structure prediction filter (RF2/AF2, iPAE < 10) → scoring consensus (Chai-1 + AF2 + Protenix, 3-model agreement) → developability filter (Agmata aggregation < 0.3, humanness > 0.8). Typical yield: 5-15% of generated backbones pass all filters.",
    source: "Proteinea internal SOP v3.2",
  },
  {
    id: "abd-002",
    title: "Format selection guidelines",
    content: "VHH (nanobody, 15 kDa): best for tissue penetration, no Fc effector functions, ideal for imaging and intracellular targets. scFv (27 kDa): similar penetration, can be reformatted to full IgG. mAb IgG1: full Fc, enables ADCC/CDC, standard for oncology. mAb IgG4: Fc-silent, preferred for blocking/neutralization without cell killing. Bispecific: dual-target engagement, complex manufacturing.",
    source: "Proteinea format decision tree",
  },
  {
    id: "abd-003",
    title: "Affinity maturation strategies",
    content: "In silico maturation: CDR walking with ProteinMPNN (fix framework, randomize CDR positions). Typical improvement: 3-10x affinity gain per round. Library size: 1000-5000 variants per CDR. Scoring: iPAE + binding energy (Rosetta ΔΔG). Key risk: aggregation propensity increases with higher affinity — always co-optimize developability.",
    source: "Proteinea maturation protocol",
  },
  {
    id: "abd-004",
    title: "Developability assessment checklist",
    content: "Critical metrics: aggregation propensity (Agmata score < 0.3), thermostability (Tm > 65°C for VHH, > 70°C for IgG), expression yield (> 0.5 g/L CHO for IgG, > 5 mg/L E.coli for VHH), humanness score (OASis > 0.8), polyreactivity (ELISA panel negative), viscosity (< 20 cP at 150 mg/mL for SC formulation).",
    source: "Proteinea developability SOP",
  },
  {
    id: "abd-005",
    title: "Epitope mapping approaches",
    content: "Computational: docking with ClusPro/HADDOCK, interface residue prediction with Chai-1 iPAE heatmaps. Experimental: HDX-MS (gold standard), alanine scanning, peptide arrays. For therapeutic differentiation: map against known antibody epitopes (trastuzumab epitope on HER2 domain IV, cetuximab on EGFR domain III) to identify novel binding sites.",
    source: "Proteinea epitope mapping guide",
  },
];

/**
 * Embedded benchmark entries: how to interpret benchmark metrics, a JAM-2
 * deep dive, and guidance on which benchmark fits which program type.
 *
 * Text is stored as proper Unicode (em dash) so it renders correctly
 * wherever the entry content is displayed.
 */
const BENCHMARK_KB: KBEntry[] = [
  {
    id: "bkb-001",
    title: "Benchmark interpretation guide",
    content: "Hit rate is the fraction of designs that bind the target (SPR/BLI KD < 100 nM). Affinity is reported as the best single binder. Target coverage is the fraction of tested targets with at least one hit. Key caveat: benchmarks test different targets — JAM-2 covers 16 antigens including GPCRs, Chai-2 only 4 soluble. Direct comparison requires matched-target analysis.",
    source: "Proteinea benchmark methodology",
  },
  {
    id: "bkb-002",
    title: "JAM-2 benchmark deep dive",
    content: "Joint Antibody-antigen Model 2 (Dec 2025). 748 designs across 16 antigens. VHH hit rate 39%, mAb 18%. Best affinity: 170 pM (HER2 VHH). Strengths: broad target coverage, joint VH/VL prediction. Weaknesses: not yet open-source, developability pass rate only 57%. The current state-of-the-art for de novo antibody design by hit rate.",
    source: "https://arxiv.org/abs/2512.20605",
  },
  {
    id: "bkb-003",
    title: "When to use which benchmark",
    content: "For VHH-focused programs: JAM-2 (highest hit rate at 39%), RFantibody (open-source, 15%). For developability-critical programs: Chai-2 (86% pass rate vs JAM-2's 57%). For mAb programs: JAM-2 is the only benchmark with mAb-format results. For rapid iteration: DiffAb/dyMEAN have open code but lower hit rates (6-8%).",
    source: "Proteinea benchmark selection guide",
  },
];

/**
 * Embedded target-biology entries: considerations per target class and
 * competitive-landscape checks to run before designing.
 *
 * Text is stored as proper Unicode (em dash) so it renders correctly
 * wherever the entry content is displayed.
 */
const TARGET_BIOLOGY_KB: KBEntry[] = [
  {
    id: "tbk-001",
    title: "Target class considerations",
    content: "Soluble targets (HER2 ECD, TNF-alpha, IL-6): well-suited for all formats, crystal structures widely available. Membrane targets (PD-L1, CD20): require careful epitope selection, VHH preferred for deep cleft access. GPCRs (CXCR7, CXCR4): hardest class — limited epitopes, dynamic conformations, VHH essential for accessing extracellular loops. Hit rates for GPCRs are typically 3-5x lower than soluble targets.",
    source: "Proteinea target assessment framework",
  },
  {
    id: "tbk-002",
    title: "Competitive landscape awareness",
    content: "Before designing: check existing approved antibodies (trastuzumab/HER2, pembrolizumab/PD-1, rituximab/CD20) and their epitopes. Differentiation strategies: novel epitope, higher affinity, better developability, bispecific engagement, enhanced effector function (afucosylation for ADCC). Key databases: IMGT, Thera-SAbDab, DrugBank.",
    source: "Proteinea BD intelligence",
  },
];

/**
 * Embedded model-registry entries: a model selection decision tree and
 * GPU requirement / scaling notes.
 *
 * Text is stored as proper Unicode (em dash) so it renders correctly
 * wherever the entry content is displayed.
 */
const MODEL_REGISTRY_KB: KBEntry[] = [
  {
    id: "mrk-001",
    title: "Model selection decision tree",
    content: "Step 1: Choose backbone generator (RFdiffusion for de novo, ABodyBuilder2 for humanization). Step 2: Choose sequence designer (ProteinMPNN, always). Step 3: Choose scoring stack — for speed: ESMFold (CPU, seconds); for accuracy: AF2 + Chai-1 consensus (GPU, minutes); for production: 3-model consensus (AF2 + Chai-1 + Protenix). Step 4: Choose developability predictor (AbLang for humanness, Agmata for aggregation).",
    source: "Proteinea model selection SOP",
  },
  {
    id: "mrk-002",
    title: "GPU requirements and scaling",
    content: "RFdiffusion: 16GB VRAM, ~30s per backbone. ProteinMPNN: 8GB, ~5s per sequence. AF2: 24GB for full model, 16GB for AF2-multimer-reduced. Chai-1: 24GB, ~2 min per complex. For 50-design campaign: ~45 min on single A100. For 500-design campaign: recommend 4x A100 or cloud batch (RunPod, ~$2/hr per A100).",
    source: "Proteinea infrastructure guide",
  },
];

// ---- Registry ----

/**
 * Registry of every knowledge base known to the app, in display order.
 *
 * `entryCount` is derived from the backing entry arrays so it cannot drift
 * from the actual content. Icons are stored as real emoji code points.
 */
export const KNOWLEDGE_BASES: KnowledgeBase[] = [
  {
    id: "kb-antibody-design",
    name: "Antibody Design",
    description: "Design pipeline SOPs, format selection, affinity maturation, developability assessment, epitope mapping. Proteinea's internal best practices.",
    type: "embedded",
    entryCount: ANTIBODY_DESIGN_KB.length,
    defaultEnabled: true,
    icon: "🧬",
    tags: ["design", "pipeline", "developability"],
  },
  {
    id: "kb-benchmarks",
    name: "Benchmark Catalog",
    description: "Interpretation guides, deep dives on JAM-2/Chai-2/RFantibody, benchmark selection criteria for different program types.",
    type: "embedded",
    entryCount: BENCHMARK_KB.length,
    defaultEnabled: true,
    icon: "📊",
    tags: ["benchmarks", "evaluation", "comparison"],
  },
  {
    id: "kb-target-biology",
    name: "Target Biology",
    description: "Target class considerations, competitive landscape awareness, epitope differentiation strategies.",
    type: "embedded",
    entryCount: TARGET_BIOLOGY_KB.length,
    defaultEnabled: true,
    icon: "🎯",
    tags: ["targets", "biology", "competitive"],
  },
  {
    id: "kb-model-registry",
    name: "Model Registry",
    description: "Model selection decision trees, GPU requirements, scaling guidance for production campaigns.",
    type: "embedded",
    entryCount: MODEL_REGISTRY_KB.length,
    // The only KB that new tasks do not enable by default.
    defaultEnabled: false,
    icon: "🤖",
    tags: ["models", "infrastructure", "gpu"],
  },
];

// ---- Content map for embedded retrieval ----

/** Lookup table from KB id to the entries backing that embedded KB. */
const KB_CONTENT: Record<string, KBEntry[]> = Object.fromEntries<KBEntry[]>([
  ["kb-antibody-design", ANTIBODY_DESIGN_KB],
  ["kb-benchmarks", BENCHMARK_KB],
  ["kb-target-biology", TARGET_BIOLOGY_KB],
  ["kb-model-registry", MODEL_REGISTRY_KB],
]);

/**
 * Query a single knowledge base by keyword.
 *
 * The query is lower-cased and split on whitespace. Each entry is scored by
 * the fraction of query words that occur as substrings of its lower-cased
 * `title + " " + content`. Entries scoring 0 are dropped; the rest are
 * returned best-first (ties keep registry order), capped at `limit`.
 *
 * The lookup is a synchronous in-memory scan of the embedded content map;
 * no other KB type is handled here.
 *
 * @param kbId - Id of a KB present in `KNOWLEDGE_BASES`.
 * @param query - Free-text query. An empty or whitespace-only query matches nothing.
 * @param limit - Maximum number of entries to return (default 3). Fractions are
 *   truncated; zero, negative, or NaN values yield no entries.
 * @returns The KB metadata, the matching entries (copies carrying `score`),
 *   and the elapsed time in whole milliseconds; `null` when `kbId` is not
 *   registered or has no entry list in the content map.
 */
export function queryKB(kbId: string, query: string, limit = 3): KBQueryResult | null {
  const kb = KNOWLEDGE_BASES.find((k) => k.id === kbId);
  if (!kb) return null;

  const entries = KB_CONTENT[kbId];
  if (!entries) return null;

  // Array.prototype.slice treats a negative end index as an offset from the
  // end, so an unclamped `limit = -1` would return "everything but the last
  // entry" instead of nothing. `NaN > 0` is false, so NaN also maps to 0.
  const maxResults = limit > 0 ? Math.floor(limit) : 0;

  const startedAt = performance.now();
  const words = query.toLowerCase().split(/\s+/).filter(Boolean);

  const matches = entries
    .map((entry) => {
      const haystack = `${entry.title} ${entry.content}`.toLowerCase();
      const hits = words.filter((w) => haystack.includes(w)).length;
      // Spread into a fresh object so the shared KB data is never mutated.
      return { ...entry, score: words.length > 0 ? hits / words.length : 0 };
    })
    .filter((entry) => entry.score > 0)
    // Sorting the array produced by map/filter leaves `entries` untouched.
    .sort((a, b) => b.score - a.score)
    .slice(0, maxResults);

  const queryMs = Math.round(performance.now() - startedAt);

  return { kb, entries: matches, queryMs };
}

/**
 * Run the same keyword query against several KBs and collect the hits.
 * Per the original note, this is the entry point the agent uses for
 * pre-plan retrieval.
 *
 * KBs for which `queryKB` returns `null`, or whose result has no entries,
 * are left out. Results keep the order of `kbIds`.
 *
 * @param kbIds - Ids of the KBs to query, in the desired output order.
 * @param query - Free-text query forwarded unchanged to `queryKB`.
 * @param limit - Per-KB cap on returned entries (default 3).
 * @returns One result per KB that produced at least one entry.
 */
export function queryMultipleKBs(
  kbIds: string[],
  query: string,
  limit = 3,
): KBQueryResult[] {
  const collected: KBQueryResult[] = [];
  for (const id of kbIds) {
    const result = queryKB(id, query, limit);
    if (result === null || result.entries.length === 0) continue;
    collected.push(result);
  }
  return collected;
}

/**
 * List every registered knowledge base.
 *
 * @returns The `KNOWLEDGE_BASES` array itself (not a copy), so mutations by
 *   the caller are visible to the rest of the module.
 */
export function listKnowledgeBases(): KnowledgeBase[] {
  const registry = KNOWLEDGE_BASES;
  return registry;
}

/**
 * Look up a knowledge base's metadata by id.
 *
 * @param id - KB id to search for (exact match).
 * @returns The first registry entry with that id, or `null` if none matches.
 */
export function getKnowledgeBase(id: string): KnowledgeBase | null {
  for (const candidate of KNOWLEDGE_BASES) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return null;
}

/**
 * Ids of the knowledge bases flagged `defaultEnabled`.
 *
 * @returns The matching ids in registry order.
 */
export function getDefaultKBIds(): string[] {
  const ids: string[] = [];
  for (const kb of KNOWLEDGE_BASES) {
    if (kb.defaultEnabled) {
      ids.push(kb.id);
    }
  }
  return ids;
}