Spaces:

meljendy
/

proteinea

Sleeping

File size: 10,870 Bytes

30cc31a

/**
 * Knowledge Base registry. Each KB is a typed, named collection of domain
 * knowledge the agent can query during planning. Users select which KBs are
 * active per-task. The agent emits kb_lookup events with surfaced results
 * and query timings.
 *
 * KB types:
 * - embedded: shipped with the app (static content, zero latency)
 * - api:      external endpoint the agent calls (future — e.g., PubMed, ClinicalTrials)
 * - file:     user-uploaded documents indexed for retrieval (future)
 * - vector:   RAG-indexed vector store (future — e.g., Proteinea internal docs)
 */

/**
 * Discriminator for how a knowledge base's content is stored and retrieved.
 * Every KB registered in this file is "embedded"; the remaining variants are
 * described in the file header as future work.
 */
export type KBType = "embedded" | "api" | "file" | "vector";

/**
 * Metadata describing one knowledge base in the registry. This carries no
 * entry content; the entries themselves are looked up separately by `id`.
 */
export interface KnowledgeBase {
  /** Identifier used to look the KB up (e.g. "kb-benchmarks"). */
  id: string;
  /** Short human-readable name. */
  name: string;
  /** Longer summary of what the KB contains. */
  description: string;
  /** Storage / retrieval kind of this KB. */
  type: KBType;
  /** Number of entries / documents in this KB. */
  entryCount: number;
  /** Whether this KB is enabled by default for new tasks. */
  defaultEnabled: boolean;
  /** Icon hint for the UI (emoji or icon name). */
  icon: string;
  /** Tags for filtering / grouping. */
  tags: string[];
}

/**
 * A single piece of knowledge inside a KB. Keyword queries match against
 * `title` and `content`.
 */
export interface KBEntry {
  /** Entry identifier (e.g. "abd-001"). */
  id: string;
  /** Short headline for the entry. */
  title: string;
  /** Full text body of the entry. */
  content: string;
  /** Source reference (paper DOI, URL, file path). */
  source?: string;
  /**
   * Relevance score 0-1 (for ranked retrieval). Absent on the stored
   * entries; queryKB fills it in on the copies it returns.
   */
  score?: number;
}

/** Outcome of querying one knowledge base. */
export interface KBQueryResult {
  /** The knowledge base that was queried. */
  kb: KnowledgeBase;
  /** Matching entries, highest score first. */
  entries: KBEntry[];
  /** Time spent on the query, in milliseconds. */
  queryMs: number;
}

// ---- Embedded KB content ----

/**
 * Embedded entries for the "kb-antibody-design" knowledge base: design
 * pipeline, format selection, affinity maturation, developability and
 * epitope mapping. The title and content text is what keyword queries
 * search, so wording changes here affect retrieval results.
 */
const ANTIBODY_DESIGN_KB: KBEntry[] = [
  {
    id: "abd-001",
    title: "De novo antibody design pipeline",
    content: "Standard pipeline: backbone generation (RFdiffusion) → sequence design (ProteinMPNN) → structure prediction filter (RF2/AF2, iPAE < 10) → scoring consensus (Chai-1 + AF2 + Protenix, 3-model agreement) → developability filter (Agmata aggregation < 0.3, humanness > 0.8). Typical yield: 5-15% of generated backbones pass all filters.",
    source: "Proteinea internal SOP v3.2",
  },
  {
    id: "abd-002",
    title: "Format selection guidelines",
    content: "VHH (nanobody, 15 kDa): best for tissue penetration, no Fc effector functions, ideal for imaging and intracellular targets. scFv (27 kDa): similar penetration, can be reformatted to full IgG. mAb IgG1: full Fc, enables ADCC/CDC, standard for oncology. mAb IgG4: Fc-silent, preferred for blocking/neutralization without cell killing. Bispecific: dual-target engagement, complex manufacturing.",
    source: "Proteinea format decision tree",
  },
  {
    id: "abd-003",
    title: "Affinity maturation strategies",
    content: "In silico maturation: CDR walking with ProteinMPNN (fix framework, randomize CDR positions). Typical improvement: 3-10x affinity gain per round. Library size: 1000-5000 variants per CDR. Scoring: iPAE + binding energy (Rosetta ΔΔG). Key risk: aggregation propensity increases with higher affinity — always co-optimize developability.",
    source: "Proteinea maturation protocol",
  },
  {
    id: "abd-004",
    title: "Developability assessment checklist",
    content: "Critical metrics: aggregation propensity (Agmata score < 0.3), thermostability (Tm > 65°C for VHH, > 70°C for IgG), expression yield (> 0.5 g/L CHO for IgG, > 5 mg/L E.coli for VHH), humanness score (OASis > 0.8), polyreactivity (ELISA panel negative), viscosity (< 20 cP at 150 mg/mL for SC formulation).",
    source: "Proteinea developability SOP",
  },
  {
    id: "abd-005",
    title: "Epitope mapping approaches",
    content: "Computational: docking with ClusPro/HADDOCK, interface residue prediction with Chai-1 iPAE heatmaps. Experimental: HDX-MS (gold standard), alanine scanning, peptide arrays. For therapeutic differentiation: map against known antibody epitopes (trastuzumab epitope on HER2 domain IV, cetuximab on EGFR domain III) to identify novel binding sites.",
    source: "Proteinea epitope mapping guide",
  },
];

/**
 * Embedded entries for the "kb-benchmarks" knowledge base: how to read
 * benchmark metrics, a JAM-2 deep dive, and guidance on which benchmark to
 * consult for a given program type.
 */
const BENCHMARK_KB: KBEntry[] = [
  {
    id: "bkb-001",
    title: "Benchmark interpretation guide",
    content: "Hit rate is the fraction of designs that bind the target (SPR/BLI KD < 100 nM). Affinity is reported as the best single binder. Target coverage is the fraction of tested targets with at least one hit. Key caveat: benchmarks test different targets — JAM-2 covers 16 antigens including GPCRs, Chai-2 only 4 soluble. Direct comparison requires matched-target analysis.",
    source: "Proteinea benchmark methodology",
  },
  {
    id: "bkb-002",
    title: "JAM-2 benchmark deep dive",
    content: "Joint Antibody-antigen Model 2 (Dec 2025). 748 designs across 16 antigens. VHH hit rate 39%, mAb 18%. Best affinity: 170 pM (HER2 VHH). Strengths: broad target coverage, joint VH/VL prediction. Weaknesses: not yet open-source, developability pass rate only 57%. The current state-of-the-art for de novo antibody design by hit rate.",
    source: "https://arxiv.org/abs/2512.20605",
  },
  {
    id: "bkb-003",
    title: "When to use which benchmark",
    content: "For VHH-focused programs: JAM-2 (highest hit rate at 39%), RFantibody (open-source, 15%). For developability-critical programs: Chai-2 (86% pass rate vs JAM-2's 57%). For mAb programs: JAM-2 is the only benchmark with mAb-format results. For rapid iteration: DiffAb/dyMEAN have open code but lower hit rates (6-8%).",
    source: "Proteinea benchmark selection guide",
  },
];

/**
 * Embedded entries for the "kb-target-biology" knowledge base: target-class
 * considerations and competitive-landscape checks.
 */
const TARGET_BIOLOGY_KB: KBEntry[] = [
  {
    id: "tbk-001",
    title: "Target class considerations",
    content: "Soluble targets (HER2 ECD, TNF-alpha, IL-6): well-suited for all formats, crystal structures widely available. Membrane targets (PD-L1, CD20): require careful epitope selection, VHH preferred for deep cleft access. GPCRs (CXCR7, CXCR4): hardest class — limited epitopes, dynamic conformations, VHH essential for accessing extracellular loops. Hit rates for GPCRs are typically 3-5x lower than soluble targets.",
    source: "Proteinea target assessment framework",
  },
  {
    id: "tbk-002",
    title: "Competitive landscape awareness",
    content: "Before designing: check existing approved antibodies (trastuzumab/HER2, pembrolizumab/PD-1, rituximab/CD20) and their epitopes. Differentiation strategies: novel epitope, higher affinity, better developability, bispecific engagement, enhanced effector function (afucosylation for ADCC). Key databases: IMGT, Thera-SAbDab, DrugBank.",
    source: "Proteinea BD intelligence",
  },
];

/**
 * Embedded entries for the "kb-model-registry" knowledge base: model
 * selection steps and GPU sizing guidance.
 */
const MODEL_REGISTRY_KB: KBEntry[] = [
  {
    id: "mrk-001",
    title: "Model selection decision tree",
    content: "Step 1: Choose backbone generator (RFdiffusion for de novo, ABodyBuilder2 for humanization). Step 2: Choose sequence designer (ProteinMPNN, always). Step 3: Choose scoring stack — for speed: ESMFold (CPU, seconds); for accuracy: AF2 + Chai-1 consensus (GPU, minutes); for production: 3-model consensus (AF2 + Chai-1 + Protenix). Step 4: Choose developability predictor (AbLang for humanness, Agmata for aggregation).",
    source: "Proteinea model selection SOP",
  },
  {
    id: "mrk-002",
    title: "GPU requirements and scaling",
    content: "RFdiffusion: 16GB VRAM, ~30s per backbone. ProteinMPNN: 8GB, ~5s per sequence. AF2: 24GB for full model, 16GB for AF2-multimer-reduced. Chai-1: 24GB, ~2 min per complex. For 50-design campaign: ~45 min on single A100. For 500-design campaign: recommend 4x A100 or cloud batch (RunPod, ~$2/hr per A100).",
    source: "Proteinea infrastructure guide",
  },
];

// ---- Registry ----

/**
 * The registry: metadata for every knowledge base the app knows about.
 * `entryCount` is derived from the matching content array's length so it
 * cannot drift from the actual data. All four KBs are embedded; every one
 * except "kb-model-registry" is enabled by default.
 */
export const KNOWLEDGE_BASES: KnowledgeBase[] = [
  {
    id: "kb-antibody-design",
    name: "Antibody Design",
    description: "Design pipeline SOPs, format selection, affinity maturation, developability assessment, epitope mapping. Proteinea's internal best practices.",
    type: "embedded",
    entryCount: ANTIBODY_DESIGN_KB.length,
    defaultEnabled: true,
    icon: "🧬",
    tags: ["design", "pipeline", "developability"],
  },
  {
    id: "kb-benchmarks",
    name: "Benchmark Catalog",
    description: "Interpretation guides, deep dives on JAM-2/Chai-2/RFantibody, benchmark selection criteria for different program types.",
    type: "embedded",
    entryCount: BENCHMARK_KB.length,
    defaultEnabled: true,
    icon: "📊",
    tags: ["benchmarks", "evaluation", "comparison"],
  },
  {
    id: "kb-target-biology",
    name: "Target Biology",
    description: "Target class considerations, competitive landscape awareness, epitope differentiation strategies.",
    type: "embedded",
    entryCount: TARGET_BIOLOGY_KB.length,
    defaultEnabled: true,
    icon: "🎯",
    tags: ["targets", "biology", "competitive"],
  },
  {
    id: "kb-model-registry",
    name: "Model Registry",
    description: "Model selection decision trees, GPU requirements, scaling guidance for production campaigns.",
    type: "embedded",
    entryCount: MODEL_REGISTRY_KB.length,
    defaultEnabled: false,
    icon: "🤖",
    tags: ["models", "infrastructure", "gpu"],
  },
];

// ---- Content map for embedded retrieval ----

/**
 * Maps a KB id to its embedded entries. The keys must match the `id` values
 * in KNOWLEDGE_BASES; queryKB returns null for a registered KB that has no
 * key here.
 */
const KB_CONTENT: Record<string, KBEntry[]> = {
  "kb-antibody-design": ANTIBODY_DESIGN_KB,
  "kb-benchmarks": BENCHMARK_KB,
  "kb-target-biology": TARGET_BIOLOGY_KB,
  "kb-model-registry": MODEL_REGISTRY_KB,
};

/**
 * Query a knowledge base by keyword. Returns matching entries ranked by
 * a simple keyword-overlap score. For embedded KBs this is instant; for
 * future api/vector KBs this would be an async call.
 *
 * Scoring: the query is lower-cased and split on whitespace. An entry's score
 * is the fraction of query words that occur as substrings of its lower-cased
 * "title content" text. Entries scoring 0 are never returned, and ties keep
 * the order the entries have in the KB.
 *
 * @param kbId  Id of a knowledge base listed in KNOWLEDGE_BASES.
 * @param query Free-text query. An empty or whitespace-only query matches
 *              nothing.
 * @param limit Maximum number of entries to return. Negative values are
 *              treated as 0.
 * @returns The KB, copies of its best-matching entries (each with `score`
 *          set) and the elapsed time in whole milliseconds; `null` when the
 *          id is not registered or has no embedded content.
 */
export function queryKB(kbId: string, query: string, limit = 3): KBQueryResult | null {
  const kb = KNOWLEDGE_BASES.find((k) => k.id === kbId);
  if (!kb) return null;

  const entries = KB_CONTENT[kbId];
  if (!entries) return null;

  // slice(0, n) with a negative n counts from the END of the array, so an
  // unclamped limit of -1 would return "everything but the last entry"
  // rather than nothing.
  const maxEntries = Math.max(0, limit);

  const t0 = performance.now();
  const words = query.toLowerCase().split(/\s+/).filter(Boolean);

  const top = entries
    .map((entry) => {
      const text = `${entry.title} ${entry.content}`.toLowerCase();
      const hits = words.filter((w) => text.includes(w)).length;
      // Guard the division: an empty query scores every entry 0.
      const score = words.length > 0 ? hits / words.length : 0;
      return { ...entry, score };
    })
    // Drop non-matches before ranking so they can never occupy a slot.
    .filter((e) => e.score > 0)
    // Array.prototype.sort is stable, so equal scores keep KB order.
    .sort((a, b) => b.score - a.score)
    .slice(0, maxEntries);

  const queryMs = Math.round(performance.now() - t0);

  return { kb, entries: top, queryMs };
}

/**
 * Run the same keyword query against several knowledge bases. The agent
 * calls this during pre-plan retrieval.
 *
 * @param kbIds KB ids to query, in the order results should be returned.
 * @param query Free-text query forwarded to queryKB.
 * @param limit Per-KB cap on returned entries, forwarded to queryKB.
 * @returns One result per KB that produced at least one entry. Unknown ids
 *          and KBs with no matches are left out.
 */
export function queryMultipleKBs(
  kbIds: string[],
  query: string,
  limit = 3,
): KBQueryResult[] {
  const results: KBQueryResult[] = [];
  for (const kbId of kbIds) {
    const result = queryKB(kbId, query, limit);
    // Skip ids that did not resolve and KBs where nothing matched.
    if (result === null || result.entries.length === 0) continue;
    results.push(result);
  }
  return results;
}

/**
 * List every registered knowledge base.
 *
 * @returns The KNOWLEDGE_BASES array itself (not a copy), so mutating the
 *          result mutates the registry.
 */
export function listKnowledgeBases(): KnowledgeBase[] {
  const registry = KNOWLEDGE_BASES;
  return registry;
}

/**
 * Look up a knowledge base's metadata by id.
 *
 * @param id KB id to search for (exact, case-sensitive match).
 * @returns The first registry entry with that id, or `null` if there is none.
 */
export function getKnowledgeBase(id: string): KnowledgeBase | null {
  for (const candidate of KNOWLEDGE_BASES) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return null;
}

/**
 * Collect the ids of all knowledge bases flagged `defaultEnabled`.
 *
 * @returns Ids in registry order; empty when no KB is enabled by default.
 */
export function getDefaultKBIds(): string[] {
  const enabledIds: string[] = [];
  for (const kb of KNOWLEDGE_BASES) {
    if (kb.defaultEnabled) {
      enabledIds.push(kb.id);
    }
  }
  return enabledIds;
}