"""
K R&D Lab — Research Assistant (RAG Chatbot)
Author: Oksana Kolisnyk | kosatiks-group.pp.ua
Repo: github.com/TEZv/K-RnD-Lab-PHYLO-03_2026
RAG pipeline: sentence-transformers + FAISS (no API key required)
Indexed on 20 curated papers: LNP delivery, protein corona, cancer variants
Confidence flags: HIGH / MEDIUM / SPECULATIVE
Never answers outside indexed papers.
"""
import os
import json
import time
import hashlib
import datetime
import requests
import gradio as gr
import numpy as np
# ─────────────────────────────────────────────
# PAPER CORPUS — 20 curated PMIDs
# Topics: LNP/brain delivery, protein corona, cancer variants
# ─────────────────────────────────────────────
# PubMed IDs of the 20 indexed papers, grouped by topic. The order matches the
# order of the entries in PAPER_CORPUS. No function visible in this file reads
# this list — presumably it is kept as a manifest of the corpus (confirm against
# other modules before removing).
PAPER_PMIDS = [
# LNP delivery (5) — all PubMed-verified
"34394960", # Hou X — LNP mRNA delivery review (Nat Rev Mater 2021)
"32251383", # Cheng Q — SORT LNPs organ selectivity (Nat Nanotechnol 2020)
"29653760", # Sabnis S — novel amino lipid series for mRNA (Mol Ther 2018)
"22782619", # Jayaraman M — ionizable lipid siRNA LNP potency (Angew Chem 2012)
"33208369", # Rosenblum D — CRISPR-Cas9 LNP cancer therapy (Sci Adv 2020)
# Protein corona (5)
"18809927", # Lundqvist M — nanoparticle size/surface protein corona (PNAS 2008)
"22086677", # Walkey CD — nanomaterial-protein interactions (Chem Soc Rev 2012)
"31565943", # Park M — accessible surface area within nanoparticle corona (Nano Lett 2019)
"33754708", # Sebastiani F — ApoE binding drives LNP rearrangement (ACS Nano 2021)
"20461061", # Akinc A — endogenous ApoE-mediated LNP liver delivery (Mol Ther 2010)
# Cancer variants & precision oncology (5)
"30096302", # Bailey MH — cancer driver genes TCGA (Cell 2018)
"30311387", # Landrum MJ — ClinVar at five years (Hum Mutat 2018)
"32461654", # Karczewski KJ — gnomAD mutational constraint 141,456 humans (Nature 2020)
"27328919", # Bouaoun L — TP53 variations IARC database (Hum Mutat 2016)
"31820981", # Lanman BA — KRAS G12C covalent inhibitor AMG 510 (J Med Chem 2020)
# LNP immunotherapy & siRNA (3)
"28678784", # Sahin U — personalized RNA mutanome vaccines (Nature 2017)
"31348638", # Kozma GT — anti-PEG IgM complement activation (ACS Nano 2019)
"33016924", # Cafri G — mRNA neoantigen T cell immunity GI cancer (J Clin Invest 2020)
# Liquid biopsy (2)
"31142840", # Cristiano S — genome-wide cfDNA fragmentation in cancer (Nature 2019)
"33883548", # Larson MH — cell-free transcriptome tissue biomarkers (Nat Commun 2021)
]
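# Curated abstracts / key content for each PMID, in the same order as PAPER_PMIDS.
# Verified against the PubMed esummary + efetch APIs on 2026-03-07.
# All PMIDs confirmed real. Abstracts were fetched directly from NCBI, except for
# entries whose text starts with "[Summary — abstract not available in PubMed XML]",
# which are written summaries. Several fetched abstracts are cut off mid-sentence.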
# PAPER_CORPUS: one dict per indexed paper with the keys
#   "pmid" (str), "title" (str), "abstract" (str, assembled from adjacent string
#   literals), "journal" (str), "year" (int) and "topic" (str, short label).
# List position is significant: _build_index() embeds the entries in this order
# and rag_query() maps FAISS result indices back with PAPER_CORPUS[idx], so index
# row i always corresponds to PAPER_CORPUS[i].
# The "topic" field is not read by any function visible in this file.
# Entries flagged NOTE(review) below contain abstract text that visibly stops
# mid-sentence; the missing text has not been reconstructed here.
PAPER_CORPUS = [
{
"pmid": "34394960",
"title": "Lipid nanoparticles for mRNA delivery.",
"abstract": (
"Messenger RNA (mRNA) has emerged as a new category of therapeutic agent to prevent and treat "
"various diseases. To function in vivo, mRNA requires safe, effective and stable delivery "
"systems that protect the nucleic acid from degradation and that allow cellular uptake and "
"mRNA release. Lipid nanoparticles have successfully entered the clinic for the delivery of "
"mRNA; in particular, lipid nanoparticle-mRNA vaccines are now in clinical use against "
"coronavirus disease 2019 (COVID-19), which marks a milestone for mRNA therapeutics. In this "
"Review, we discuss the design of lipid nanoparticles for mRNA delivery and examine "
"physiological barriers and possible administration routes for lipid nanoparticle-mRNA "
"systems. We then consider key points for the clinical translation of lipid nanoparticle-mRNA "
"formulations, including good manufacturing practice, stability, storage and safety, and "
"highlight preclinical and clinical studies of lipid nanoparticle-mRNA therapeutics for "
"infectious diseases, cancer and genetic disorders. Finally, we give an outlook to future "
"possibilities and remaining challenges for this promising technology."
),
"journal": "Nat Rev Mater",
"year": 2021,
"topic": "LNP mRNA delivery",
},
{
"pmid": "32251383",
"title": "Selective organ targeting (SORT) nanoparticles for tissue-specific mRNA delivery and CRISPR-Cas gene editing.",
"abstract": (
"CRISPR-Cas gene editing and messenger RNA-based protein replacement therapy hold tremendous "
"potential to effectively treat disease-causing mutations with diverse cellular origin. "
"However, it is currently impossible to rationally design nanoparticles that selectively "
"target specific tissues. Here, we report a strategy termed selective organ targeting (SORT) "
"wherein multiple classes of lipid nanoparticles are systematically engineered to exclusively "
"edit extrahepatic tissues via addition of a supplemental SORT molecule. Lung-, spleen- and "
"liver-targeted SORT lipid nanoparticles were designed to selectively edit therapeutically "
"relevant cell types including epithelial cells, endothelial cells, B cells, T cells and "
"hepatocytes. SORT is compatible with multiple gene editing techniques, including mRNA, Cas9 "
"mRNA/single guide RNA and Cas9 ribonucleoprotein complexes, and is envisioned to aid the "
"development of protein replacement and gene correction therapeutics in targeted tissues."
),
"journal": "Nat Nanotechnol",
"year": 2020,
"topic": "LNP organ selectivity",
},
{
"pmid": "29653760",
"title": "A Novel Amino Lipid Series for mRNA Delivery: Improved Endosomal Escape and Sustained Pharmacology and Safety in Non-human Primates.",
"abstract": (
"The success of mRNA-based therapies depends on the availability of a safe and efficient "
"delivery vehicle. Lipid nanoparticles have been identified as a viable option. However, "
"there are concerns whether an acceptable tolerability profile for chronic dosing can be "
"achieved. The efficiency and tolerability of lipid nanoparticles has been attributed to the "
"amino lipid. Therefore, we developed a new series of amino lipids that address this concern. "
"Clear structure-activity relationships were developed that resulted in a new amino lipid "
"that affords efficient mRNA delivery in rodent and primate models with optimal "
"pharmacokinetics. A 1-month toxicology evaluation in rat and non-human primate demonstrated "
"no adverse events with the new lipid nanoparticle system. Mechanistic studies demonstrate "
"that the improved efficiency can be attributed to increased endosomal escape. This effort "
"has resulted in the first example of the ability to safely repeat dose mRNA-containing lipid "
"nanoparticles in non-human primate at therapeutically relevant levels."
),
"journal": "Mol Ther",
"year": 2018,
"topic": "LNP ionizable lipid",
},
{
"pmid": "22782619",
"title": "Maximizing the potency of siRNA lipid nanoparticles for hepatic gene silencing in vivo.",
"abstract": (
"Special (lipid) delivery: The role of the ionizable lipid pK(a) in the in vivo delivery of "
"siRNA by lipid nanoparticles has been studied with a large number of head group "
"modifications to the lipids. A tight correlation between the lipid pK(a) value and silencing "
"of the mouse FVII gene (FVII ED(50) ) was found, with an optimal pK(a) range of 6.2-6.5. The "
"most potent cationic lipid from this study has ED(50) levels around 0.005 mg kg(-1) in mice "
"and less than 0.03 mg kg(-1) in non-human primates."
),
"journal": "Angew Chem Int Ed Engl",
"year": 2012,
"topic": "LNP ionizable lipid siRNA",
},
{
"pmid": "33208369",
"title": "CRISPR-Cas9 genome editing using targeted lipid nanoparticles for cancer therapy.",
# NOTE(review): abstract text stops mid-sentence ("... CRISPR-LNPs against") — looks truncated.
"abstract": (
"Harnessing CRISPR-Cas9 technology for cancer therapeutics has been hampered by low editing "
"efficiency in tumors and potential toxicity of existing delivery systems. Here, we describe "
"a safe and efficient lipid nanoparticle (LNP) for the delivery of Cas9 mRNA and sgRNAs that "
"use a novel amino-ionizable lipid. A single intracerebral injection of CRISPR-LNPs against"
),
"journal": "Sci Adv",
"year": 2020,
"topic": "LNP cancer CRISPR",
},
{
"pmid": "18809927",
"title": "Nanoparticle size and surface properties determine the protein corona with possible implications for biological impacts.",
# NOTE(review): abstract text ends without terminal punctuation — possibly truncated.
"abstract": (
"Nanoparticles in a biological fluid (plasma, or otherwise) associate with a range of "
"biopolymers, especially proteins, organized into the \"protein corona\" that is associated "
"with the nanoparticle and continuously exchanging with the proteins in the environment. "
"Methodologies to determine the corona and to understand its dependence on nanomaterial "
"properties are likely to become important in bionanoscience. Here, we study the long-lived "
"(\"hard\") protein corona formed from human plasma for a range of nanoparticles that differ "
"in surface properties and size. Six different polystyrene nanoparticles were studied: three "
"different surface chemistries (plain PS, carboxyl-modified, and amine-modified) and two "
"sizes of each (50 and 100 nm), enabling us to perform systematic studies of the effect of "
"surface properties and size on the detailed protein coronas. Proteins in the corona that are "
"conserved and unique across the nanoparticle types were identified and classified according "
"to the protein functional properties. Remarkably, both size and surface properties were "
"found to play a very significant role in determining the nanoparticle coronas on the "
"different particles of identical materials"
),
"journal": "Proc Natl Acad Sci U S A",
"year": 2008,
"topic": "protein corona",
},
{
"pmid": "22086677",
"title": "Understanding and controlling the interaction of nanomaterials with proteins in a physiological environment.",
# NOTE(review): abstract text stops mid-sentence ("We further describe") — looks truncated.
"abstract": (
"Nanomaterials hold promise as multifunctional diagnostic and therapeutic agents. However, "
"the effective application of nanomaterials is hampered by limited understanding and control "
"over their interactions with complex biological systems. When a nanomaterial enters a "
"physiological environment, it rapidly adsorbs proteins forming what is known as the protein "
"\'corona\'. The protein corona alters the size and interfacial composition of a "
"nanomaterial, giving it a biological identity that is distinct from its synthetic identity. "
"The biological identity determines the physiological response including signalling, "
"kinetics, transport, accumulation, and toxicity. The structure and composition of the "
"protein corona depends on the synthetic identity of the nanomaterial (size, shape, and "
"composition), the nature of the physiological environment (blood, interstitial fluid, cell "
"cytoplasm, etc.), and the duration of exposure. In this critical review, we discuss the "
"formation of the protein corona, its structure and composition, and its influence on the "
"physiological response. We also present an \'adsorbome\' of 125 plasma proteins that are "
"known to associate with nanomaterials. We further describe"
),
"journal": "Chem Soc Rev",
"year": 2012,
"topic": "protein corona",
},
{
"pmid": "31565943",
"title": "Measuring the Accessible Surface Area within the Nanoparticle Corona Using Molecular Probe Adsorption.",
# NOTE(review): abstract text stops mid-sentence ("... of the fluorophore,") — looks truncated.
"abstract": (
"The corona phase-the adsorbed layer of polymer, surfactant, or stabilizer molecules around a "
"nanoparticle-is typically utilized to disperse nanoparticles into a solution or solid phase. "
"However, this phase also controls molecular access to the nanoparticle surface, a property "
"important for catalytic activity and sensor applications. Unfortunately, few methods can "
"directly probe the structure of this corona phase, which is subcategorized as either a hard, "
"immobile corona or a soft, transient corona in exchange with components in the bulk "
"solution. In this work, we introduce a molecular probe adsorption (MPA) method for measuring "
"the accessible nanoparticle surface area using a titration of a quenchable fluorescent "
"molecule. For example, riboflavin is utilized to measure the surface area of gold "
"nanoparticle standards, as well as corona phases on dispersed single-walled carbon nanotubes "
"and graphene sheets. A material balance on the titration yields certain surface coverage "
"parameters, including the ratio of the surface area to dissociation constant of the "
"fluorophore,"
),
"journal": "Nano Lett",
"year": 2019,
"topic": "protein corona hard/soft",
},
{
"pmid": "33754708",
"title": "Apolipoprotein E Binding Drives Structural and Compositional Rearrangement of mRNA-Containing Lipid Nanoparticles.",
# NOTE(review): abstract text stops mid-sentence ("... LNP administration,") — looks truncated.
"abstract": (
"Emerging therapeutic treatments based on the production of proteins by delivering mRNA have "
"become increasingly important in recent times. While lipid nanoparticles (LNPs) are approved "
"vehicles for small interfering RNA delivery, there are still challenges to use this "
"formulation for mRNA delivery. LNPs are typically a mixture of a cationic lipid, "
"distearoylphosphatidylcholine (DSPC), cholesterol, and a PEG-lipid. The structural "
"characterization of mRNA-containing LNPs (mRNA-LNPs) is crucial for a full understanding of "
"the way in which they function, but this information alone is not enough to predict their "
"fate upon entering the bloodstream. The biodistribution and cellular uptake of LNPs are "
"affected by their surface composition as well as by the extracellular proteins present at "
"the site of LNP administration,"
),
"journal": "ACS Nano",
"year": 2021,
"topic": "ApoE LNP corona",
},
{
"pmid": "20461061",
"title": "Targeted delivery of RNAi therapeutics with endogenous and exogenous ligand-based mechanisms.",
"abstract": (
"Lipid nanoparticles (LNPs) have proven to be highly efficient carriers of short-interfering "
"RNAs (siRNAs) to hepatocytes in vivo; however, the precise mechanism by which this efficient "
"delivery occurs has yet to be elucidated. We found that apolipoprotein E (apoE), which plays "
"a major role in the clearance and hepatocellular uptake of physiological lipoproteins, also "
"acts as an endogenous targeting ligand for ionizable LNPs (iLNPs), but not cationic LNPs "
"(cLNPs). The role of apoE was investigated using both in vitro studies employing recombinant "
"apoE and in vivo studies in wild-type and apoE(-/-) mice. Receptor dependence was explored "
"in vitro and in vivo using low-density lipoprotein receptor (LDLR(-/-))-deficient mice. As "
"an alternative to endogenous apoE-based targeting, we developed a targeting approach using "
"an exogenous ligand containing a multivalent N-acetylgalactosamine (GalNAc)-cluster, which "
"binds with high affinity to the asialoglycoprotein receptor (ASGPR) expressed on "
"hepatocytes. Both apoE-based endogenous and GalNAc-based exogenous targeting appear to be "
"highly effective strategies for the delivery of iLNPs to liver."
),
"journal": "Mol Ther",
"year": 2010,
"topic": "ApoE LNP liver delivery",
},
{
"pmid": "30096302",
"title": "Comprehensive Characterization of Cancer Driver Genes and Mutations.",
# The text below is marked in-line as a summary, not the PubMed abstract.
"abstract": (
"[Summary — abstract not available in PubMed XML] Bailey MH et al. analyzed 9,423 tumors across 33 cancer types from TCGA to identify 299 "
"cancer driver genes using 26 computational tools. The study found that most cancers have 2-6 "
"driver gene mutations. TP53 is the most frequently mutated driver gene (42% of cancers). "
"KRAS mutations dominate in PDAC (92%), LUAD (33%), and COAD (43%). Oncogenes are "
"predominantly activated by missense mutations at hotspots; tumor suppressors are inactivated "
"by truncating mutations or deletions. The pan-cancer driver landscape varies substantially "
"across cancer types, with rare cancers often having unique driver profiles. This resource "
"provides a comprehensive reference for cancer genomics and therapeutic target "
"identification."
),
"journal": "Cell",
"year": 2018,
"topic": "cancer driver genes",
},
{
"pmid": "30311387",
"title": "ClinVar at five years: Delivering on the promise.",
# NOTE(review): abstract text stops mid-word ("an invaluable resour") — looks truncated.
"abstract": (
"The increasing application of genetic testing for determining the causes underlying "
"Mendelian, pharmacogenetic, and somatic phenotypes has accelerated the discovery of novel "
"variants by clinical genetics laboratories, resulting in a critical need for interpreting "
"the significance of these variants and presenting considerable challenges. Launched in 2013 "
"at the National Center for Biotechnology Information, National Institutes of Health, ClinVar "
"is a public database for clinical laboratories, researchers, expert panels, and others to "
"share their interpretations of variants with their evidence. The database holds 600,000 "
"submitted records from 1,000 submitters, representing 430,000 unique variants. ClinVar "
"encourages submissions of variants reviewed by expert panels, as expert consensus confers a "
"high standard. Aggregating data from many groups in a single database allows comparison of "
"interpretations, providing transparency into the concordance or discordance of "
"interpretations. In its first five years, ClinVar has successfully provided a gateway for "
"the submission of medically relevant variants and interpretations of their significance to "
"disease. It has become an invaluable resour"
),
"journal": "Hum Mutat",
"year": 2018,
"topic": "ClinVar variant classification",
},
{
"pmid": "32461654",
"title": "The mutational constraint spectrum quantified from variation in 141,456 humans.",
# NOTE(review): abstract text ends without terminal punctuation ("... sample sizes") — possibly truncated.
"abstract": (
"Genetic variants that inactivate protein-coding genes are a powerful source of information "
"about the phenotypic consequences of gene disruption: genes that are crucial for the "
"function of an organism will be depleted of such variants in natural populations, whereas "
"non-essential genes will tolerate their accumulation. However, predicted loss-of-function "
"variants are enriched for annotation errors, and tend to be found at extremely low "
"frequencies, so their analysis requires careful variant annotation and very large sample "
"sizes"
),
"journal": "Nature",
"year": 2020,
"topic": "gnomAD population variants",
},
{
"pmid": "27328919",
"title": "TP53 Variations in Human Cancers: New Lessons from the IARC TP53 Database and Genomics Data.",
# NOTE(review): abstract text stops mid-sentence ("... TP53 variations in") — looks truncated.
"abstract": (
"TP53 gene mutations are one of the most frequent somatic events in cancer. The IARC TP53 "
"Database (http://p53.iarc.fr) is a popular resource that compiles occurrence and phenotype "
"data on TP53 germline and somatic variations linked to human cancer. The deluge of data "
"coming from cancer genomic studies generates new data on TP53 variations and attracts a "
"growing number of database users for the interpretation of TP53 variants. Here, we present "
"the current contents and functionalities of the IARC TP53 Database and perform a systematic "
"analysis of TP53 somatic mutation data extracted from this database and from genomic data "
"repositories. This analysis showed that IARC has more TP53 somatic mutation data than "
"genomic repositories (29,000 vs. 4,000). However, the more complete screening achieved by "
"genomic studies highlighted some overlooked facts about TP53 mutations, such as the presence "
"of a significant number of mutations occurring outside the DNA-binding domain in specific "
"cancer types. We also provide an update on TP53 inherited variants including the ones that "
"should be considered as neutral frequent variations. We thus provide an update of current "
"knowledge on TP53 variations in"
),
"journal": "Hum Mutat",
"year": 2016,
"topic": "TP53 mutations cancer",
},
{
"pmid": "31820981",
"title": "Discovery of a Covalent Inhibitor of KRAS(G12C) (AMG 510) for the Treatment of Solid Tumors.",
# The text below is marked in-line as a summary, not the PubMed abstract.
"abstract": (
"[Summary — abstract not available in PubMed XML] KRASG12C has emerged as a promising target in solid tumors. Lanman BA et al. report the "
"discovery of AMG 510 (sotorasib), a covalent inhibitor targeting the mutant cysteine-12 "
"residue of KRAS G12C. The authors exploited a cryptic pocket (H95/Y96/Q99) identified in "
"KRASG12C using structure-based design, leading to a novel quinazolinone scaffold. AMG 510 is "
"highly potent, selective, and well-tolerated. It entered phase I clinical trials "
"(NCT03600883) and subsequently received FDA approval as sotorasib (Lumakras) for KRAS "
"G12C-mutant NSCLC. This work established the first clinically viable direct KRAS inhibitor, "
"overcoming decades of the \'undruggable\' KRAS paradigm. Resistance mechanisms include "
"secondary KRAS mutations and bypass pathway activation via EGFR, MET, and RET."
),
"journal": "J Med Chem",
"year": 2020,
"topic": "KRAS G12C inhibitor",
},
{
"pmid": "28678784",
"title": "Personalized RNA mutanome vaccines mobilize poly-specific therapeutic immunity against cancer.",
# NOTE(review): abstract text stops mid-word ("outgrowth of β2-m") — looks truncated.
"abstract": (
"T cells directed against mutant neo-epitopes drive cancer immunity. However, spontaneous "
"immune recognition of mutations is inefficient. We recently introduced the concept of "
"individualized mutanome vaccines and implemented an RNA-based poly-neo-epitope approach to "
"mobilize immunity against a spectrum of cancer mutations. Here we report the first-in-human "
"application of this concept in melanoma. We set up a process comprising comprehensive "
"identification of individual mutations, computational prediction of neo-epitopes, and design "
"and manufacturing of a vaccine unique for each patient. All patients developed T cell "
"responses against multiple vaccine neo-epitopes at up to high single-digit percentages. "
"Vaccine-induced T cell infiltration and neo-epitope-specific killing of autologous tumour "
"cells were shown in post-vaccination resected metastases from two patients. The cumulative "
"rate of metastatic events was highly significantly reduced after the start of vaccination, "
"resulting in a sustained progression-free survival. Two of the five patients with metastatic "
"disease experienced vaccine-related objective responses. One of these patients had a late "
"relapse owing to outgrowth of β2-m"
),
"journal": "Nature",
"year": 2017,
"topic": "mRNA cancer vaccine",
},
{
"pmid": "31348638",
"title": "Pseudo-anaphylaxis to Polyethylene Glycol (PEG)-Coated Liposomes: Roles of Anti-PEG IgM and Complement Activation in a Porcine Model of Human Infusion Reactions.",
# NOTE(review): abstract text stops mid-sentence ("... PEGylated liposomes to IgM") — looks truncated.
"abstract": (
"Polyethylene glycol (PEG)-coated nanopharmaceuticals can cause mild to severe "
"hypersensitivity reactions (HSRs), which can occasionally be life threatening or even "
"lethal. The phenomenon represents an unsolved immune barrier to the use of these drugs, yet "
"its mechanism is poorly understood. This study showed that a single i.v. injection in pigs "
"of a low dose of PEGylated liposomes (Doxebo) induced a massive rise of anti-PEG IgM in "
"blood, peaking at days 7-9 and declining over 6 weeks. Bolus injections of PEG-liposomes "
"during seroconversion resulted in anaphylactoid shock (pseudo-anaphylaxis) within 2-3 min, "
"although similar treatments of naı̈ve animals led to only mild hemodynamic disturbance. "
"Parallel measurement of pulmonary arterial pressure (PAP) and sC5b-9 in blood, taken as "
"measures of HSR and complement activation, respectively, showed a concordant rise of the two "
"variables within 3 min and a decline within 15 min, suggesting a causal relationship between "
"complement activation and pulmonary hypertension. We also observed a rapid decline of "
"anti-PEG IgM in the blood within minutes, increased binding of PEGylated liposomes to IgM"
),
"journal": "ACS Nano",
"year": 2019,
"topic": "anti-PEG immunity LNP",
},
{
"pmid": "33016924",
"title": "mRNA vaccine-induced neoantigen-specific T cell immunity in patients with gastrointestinal cancer.",
# NOTE(review): abstract text stops mid-word ("adoptive T ce") — looks truncated. Section labels
# (BACKGROUND/METHODS/RESULTS/CONCLUSION) are fused to the adjacent words in the stored text.
"abstract": (
"BACKGROUNDTherapeutic vaccinations against cancer have mainly targeted differentiation "
"antigens, cancer-testis antigens, and overexpressed antigens and have thus far resulted in "
"little clinical benefit. Studies conducted by multiple groups have demonstrated that T cells "
"recognizing neoantigens are present in most cancers and offer a specific and highly "
"immunogenic target for personalized vaccination.METHODSWe recently developed a process using "
"tumor-infiltrating lymphocytes to identify the specific immunogenic mutations expressed in "
"patients\' tumors. Here, validated, defined neoantigens, predicted neoepitopes, and "
"mutations of driver genes were concatenated into a single mRNA construct to vaccinate "
"patients with metastatic gastrointestinal cancer.RESULTSThe vaccine was safe and elicited "
"mutation-specific T cell responses against predicted neoepitopes not detected before "
"vaccination. Furthermore, we were able to isolate and verify T cell receptors targeting "
"KRASG12D mutation. We observed no objective clinical responses in the 4 patients treated in "
"this trial.CONCLUSIONThis vaccine was safe, and potential future combination of such "
"vaccines with checkpoint inhibitors or adoptive T ce"
),
"journal": "J Clin Invest",
"year": 2020,
"topic": "mRNA neoantigen vaccine",
},
{
"pmid": "31142840",
"title": "Genome-wide cell-free DNA fragmentation in patients with cancer.",
# NOTE(review): written in the third person ("Cristiano S et al. developed ...") — reads like
# a summary rather than the published abstract, but carries no "[Summary ...]" marker; confirm.
"abstract": (
"Cristiano S et al. developed DELFI (DNA EvaLuation of Fragments for early Interception), a "
"genome-wide approach analyzing cell-free DNA fragmentation patterns in plasma. Fragmentation "
"profiles across ~1 million regions reflect chromatin organization of tumor cells of origin. "
"Machine learning models trained on fragmentation patterns detected cancer in 74% of 208 "
"patients across 7 cancer types (lung, breast, colorectal, ovarian, liver, gastric, "
"pancreatic) at 98% specificity. Early-stage detection sensitivity was 57% for Stage I/II. "
"The approach provides tissue-of-origin information and outperforms single-analyte ctDNA "
"mutation detection for early-stage cancers. cfDNA fragmentation is a promising non-invasive "
"biomarker for multi-cancer early detection liquid biopsy."
),
"journal": "Nature",
"year": 2019,
"topic": "cfDNA liquid biopsy",
},
{
"pmid": "33883548",
"title": "A comprehensive characterization of the cell-free transcriptome reveals tissue- and subtype-specific biomarkers for cancer detection.",
"abstract": (
"Cell-free RNA (cfRNA) is a promising analyte for cancer detection. However, a comprehensive "
"assessment of cfRNA in individuals with and without cancer has not been conducted. We "
"perform the first transcriptome-wide characterization of cfRNA in cancer (stage III breast "
"[n = 46], lung [n = 30]) and non-cancer (n = 89) participants from the Circulating Cell-free "
"Genome Atlas (NCT02889978). Of 57,820 annotated genes, 39,564 (68%) are not detected in "
"cfRNA from non-cancer individuals. Within these low-noise regions, we identify tissue- and "
"cancer-specific genes, defined as \"dark channel biomarker\" (DCB) genes, that are "
"recurrently detected in individuals with cancer. DCB levels in plasma correlate with tumor "
"shedding rate and RNA expression in matched tissue, suggesting that DCBs with high "
"expression in tumor tissue could enhance cancer detection in patients with low levels of "
"circulating tumor DNA. Overall, cfRNA provides a unique opportunity to detect cancer, "
"predict the tumor tissue of origin, and determine the cancer subtype."
),
"journal": "Nat Commun",
"year": 2021,
"topic": "cfRNA liquid biopsy",
},
]
# ─────────────────────────────────────────────
# RAG ENGINE
# ─────────────────────────────────────────────
# Module-level RAG state. All three start as None and are assigned by
# _build_index(); rag_query() triggers the build when _rag_index is still None.
_rag_index = None  # FAISS inner-product index over the normalized paper embeddings
_rag_embeddings = None  # normalized embedding matrix, one row per PAPER_CORPUS entry
_rag_model = None  # SentenceTransformer instance used to encode both papers and queries
EMBED_MODEL = "all-MiniLM-L6-v2" # 80 MB, runs on CPU, no API key
def _build_index():
"""Build FAISS index from paper corpus. Called once at startup."""
global _rag_index, _rag_embeddings, _rag_model
try:
from sentence_transformers import SentenceTransformer
import faiss
except ImportError:
return False, "sentence-transformers or faiss-cpu not installed. Run: pip install sentence-transformers faiss-cpu"
_rag_model = SentenceTransformer(EMBED_MODEL)
# Build text chunks: title + abstract for each paper
texts = []
for paper in PAPER_CORPUS:
chunk = f"Title: {paper['title']}\nAbstract: {paper['abstract']}\nJournal: {paper['journal']} ({paper['year']})"
texts.append(chunk)
_rag_embeddings = _rag_model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
_rag_embeddings = _rag_embeddings / np.linalg.norm(_rag_embeddings, axis=1, keepdims=True) # normalize
dim = _rag_embeddings.shape[1]
_rag_index = faiss.IndexFlatIP(dim) # Inner product = cosine similarity on normalized vectors
_rag_index.add(_rag_embeddings.astype(np.float32))
return True, f"Index built: {len(PAPER_CORPUS)} papers, {dim}-dim embeddings"
def _confidence_flag(score: float, n_results: int) -> str:
"""Assign confidence based on retrieval score."""
if score >= 0.55 and n_results >= 2:
return "🟢 HIGH"
elif score >= 0.35:
return "🟡 MEDIUM"
else:
return "🔴 SPECULATIVE"
def rag_query(question: str, top_k: int = 3) -> str:
    """Answer *question* with the best-matching indexed papers, as Markdown.

    The index is built lazily on first use via ``_build_index()``. The question
    is embedded, L2-normalized and searched against the FAISS index; hits with
    a cosine score below 0.20 are discarded.

    Args:
        question: Free-text user question. Blank or ``None`` input is rejected
            with a short prompt instead of being embedded.
        top_k: Maximum number of papers to return. Clamped to the range
            ``1..len(PAPER_CORPUS)`` so out-of-range values cannot break the
            FAISS search call.

    Returns:
        A Markdown string: a confidence header followed by one section per
        retained paper and a disclaimer, or a warning / "no relevant
        information" message when nothing can be answered.
    """
    # Blank input carries no signal; answer before paying for model loading.
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    if _rag_index is None or _rag_model is None:
        ok, msg = _build_index()
        if not ok:
            return f"⚠️ RAG system unavailable: {msg}"

    n_papers = len(PAPER_CORPUS)
    k = max(1, min(int(top_k), n_papers))

    # Encode and normalize the query the same way the corpus was encoded.
    q_emb = _rag_model.encode([question], convert_to_numpy=True, show_progress_bar=False)
    q_norm = np.linalg.norm(q_emb, axis=1, keepdims=True)
    q_emb = q_emb / np.maximum(q_norm, 1e-12)

    scores, indices = _rag_index.search(q_emb.astype(np.float32), k)

    # Keep only real hits (FAISS pads missing results with index -1) that clear
    # the minimum similarity threshold.
    min_score = 0.20
    valid = [
        (float(s), int(i))
        for s, i in zip(scores[0], indices[0])
        if i >= 0 and s >= min_score
    ]
    if not valid:
        return (
            "❌ **No relevant information found in the indexed papers.**\n\n"
            f"This assistant only answers questions based on {n_papers} indexed papers on:\n"
            "- LNP drug delivery (brain/GBM focus)\n"
            "- Protein corona biology\n"
            "- Cancer variants and precision oncology\n"
            "- Liquid biopsy biomarkers\n\n"
            "Please rephrase your question or ask about these topics."
        )

    # FAISS returns hits sorted by descending score, so the first one is the best.
    top_score = valid[0][0]
    confidence = _confidence_flag(top_score, len(valid))

    answer_parts = [f"**Confidence: {confidence}** (retrieval score: {top_score:.3f})\n"]
    for rank, (score, idx) in enumerate(valid, 1):
        paper = PAPER_CORPUS[idx]
        answer_parts.append(
            f"### [{rank}] {paper['title']}\n"
            f"*{paper['journal']}, {paper['year']} | PMID: {paper['pmid']}*\n\n"
            f"{paper['abstract']}\n"
            f"*(Relevance score: {score:.3f})*"
        )
    answer_parts.append(
        "\n---\n"
        f"⚠️ *This answer is grounded exclusively in the {n_papers} indexed papers. "
        "For clinical decisions, consult primary literature and domain experts.*"
    )
    return "\n\n".join(answer_parts)
# ─────────────────────────────────────────────
# GRADIO TAB BUILDER
# ─────────────────────────────────────────────
def build_chatbot_tab():
    """Create the research-assistant chat UI in the currently open Gradio context.

    Per the original note, this is called from app.py to inject the chatbot into
    Tab A6; the standalone entry point in this file calls it inside a
    ``gr.Blocks`` context. The layout is an intro note, then a row with a wide
    chat column (chat window, question box + Send button, Clear button) and a
    narrow reference column (indexed topics, confidence-flag legend). Send and
    Enter both route the question through ``rag_query``.
    """
    intro_md = (
        "**Status:** Model loads on first query (~30s)...\n\n"
        "Ask questions about LNP delivery, protein corona, cancer variants, or liquid biopsy. "
        "Answers are grounded in 20 indexed papers — never fabricated."
    )
    topics_md = (
        "**LNP Delivery**\n"
        "- mRNA-LNP formulation\n"
        "- Ionizable lipids & pKa\n"
        "- Brain/GBM delivery\n"
        "- Organ selectivity (SORT)\n"
        "- PEG & anti-PEG immunity\n\n"
        "**Protein Corona**\n"
        "- Hard vs soft corona\n"
        "- Vroman effect kinetics\n"
        "- ApoE/LDLR targeting\n\n"
        "**Cancer Variants**\n"
        "- TP53 mutation spectrum\n"
        "- KRAS G12C resistance\n"
        "- ClinVar classification\n\n"
        "**Liquid Biopsy**\n"
        "- ctDNA methylation\n"
        "- cfRNA biomarkers"
    )
    flags_md = (
        "### 🔑 Confidence Flags\n"
        "🟢 **HIGH** — strong match (≥0.55)\n"
        "🟡 **MEDIUM** — moderate match (0.35–0.55)\n"
        "🔴 **SPECULATIVE** — weak match (<0.35)\n\n"
        "*Only answers from indexed papers are shown.*"
    )

    gr.Markdown(intro_md)
    with gr.Row():
        # Left: the conversation itself.
        with gr.Column(scale=3):
            chatbox = gr.Chatbot(label="Research Assistant", height=420, bubble_full_width=False)
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Ask about LNP delivery, protein corona, cancer variants...",
                    label="Your question",
                    lines=2,
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear conversation", size="sm")
        # Right: static reference panels.
        with gr.Column(scale=1):
            gr.Markdown("### 📚 Indexed Topics")
            gr.Markdown(topics_md)
            gr.Markdown(flags_md)

    def respond(message, history):
        # Whitespace-only input leaves the history untouched; either way the
        # textbox is cleared via the second return value.
        question = message.strip()
        if question:
            answer = rag_query(question)
            history = history or []
            history.append((message, answer))
        return history, ""

    # The Send button and pressing Enter in the textbox share one handler.
    for register in (send_btn.click, user_input.submit):
        register(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input])
    clear_btn.click(lambda: ([], ""), outputs=[chatbox, user_input])
# ─────────────────────────────────────────────
# STANDALONE MODE
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Standalone launch: build the index eagerly so the first question does not
    # pay the model-loading cost, and print the build status either way.
    print("Building RAG index...")
    index_ready, build_status = _build_index()
    print(build_status)

    # Host the chatbot tab in its own minimal Blocks app.
    with gr.Blocks(title="K R&D Lab — Research Assistant") as demo:
        gr.Markdown("# 🤖 K R&D Lab Research Assistant\n*Standalone mode*")
        build_chatbot_tab()
    demo.launch(share=False)
|