| """ |
| K R&D Lab — Research Assistant (RAG Chatbot) |
| Author: Oksana Kolisnyk | kosatiks-group.pp.ua |
| Repo: github.com/TEZv/K-RnD-Lab-PHYLO-03_2026 |
| |
| RAG pipeline: sentence-transformers + FAISS (no API key required) |
| Indexed on 20 curated papers: LNP delivery, protein corona, cancer variants |
| Confidence flags: HIGH / MEDIUM / SPECULATIVE |
| Never answers outside indexed papers. |
| """ |
|
|
| import os |
| import json |
| import time |
| import hashlib |
| import datetime |
| import requests |
| import gradio as gr |
| import numpy as np |
|
|
| |
| |
| |
| |
|
|
| PAPER_PMIDS = [ |
| |
| "34394960", |
| "32251383", |
| "29653760", |
| "22782619", |
| "33208369", |
| |
| "18809927", |
| "22086677", |
| "31565943", |
| "33754708", |
| "20461061", |
| |
| "30096302", |
| "30311387", |
| "32461654", |
| "27328919", |
| "31820981", |
| |
| "28678784", |
| "31348638", |
| "33016924", |
| |
| "31142840", |
| "33883548", |
| ] |
|
|
| |
| |
| |
| PAPER_CORPUS = [ |
| { |
| "pmid": "34394960", |
| "title": "Lipid nanoparticles for mRNA delivery.", |
| "abstract": ( |
| "Messenger RNA (mRNA) has emerged as a new category of therapeutic agent to prevent and treat " |
| "various diseases. To function in vivo, mRNA requires safe, effective and stable delivery " |
| "systems that protect the nucleic acid from degradation and that allow cellular uptake and " |
| "mRNA release. Lipid nanoparticles have successfully entered the clinic for the delivery of " |
| "mRNA; in particular, lipid nanoparticle-mRNA vaccines are now in clinical use against " |
| "coronavirus disease 2019 (COVID-19), which marks a milestone for mRNA therapeutics. In this " |
| "Review, we discuss the design of lipid nanoparticles for mRNA delivery and examine " |
| "physiological barriers and possible administration routes for lipid nanoparticle-mRNA " |
| "systems. We then consider key points for the clinical translation of lipid nanoparticle-mRNA " |
| "formulations, including good manufacturing practice, stability, storage and safety, and " |
| "highlight preclinical and clinical studies of lipid nanoparticle-mRNA therapeutics for " |
| "infectious diseases, cancer and genetic disorders. Finally, we give an outlook to future " |
| "possibilities and remaining challenges for this promising technology." |
| ), |
| "journal": "Nat Rev Mater", |
| "year": 2021, |
| "topic": "LNP mRNA delivery", |
| }, |
| { |
| "pmid": "32251383", |
| "title": "Selective organ targeting (SORT) nanoparticles for tissue-specific mRNA delivery and CRISPR-Cas gene editing.", |
| "abstract": ( |
| "CRISPR-Cas gene editing and messenger RNA-based protein replacement therapy hold tremendous " |
| "potential to effectively treat disease-causing mutations with diverse cellular origin. " |
| "However, it is currently impossible to rationally design nanoparticles that selectively " |
| "target specific tissues. Here, we report a strategy termed selective organ targeting (SORT) " |
| "wherein multiple classes of lipid nanoparticles are systematically engineered to exclusively " |
| "edit extrahepatic tissues via addition of a supplemental SORT molecule. Lung-, spleen- and " |
| "liver-targeted SORT lipid nanoparticles were designed to selectively edit therapeutically " |
| "relevant cell types including epithelial cells, endothelial cells, B cells, T cells and " |
| "hepatocytes. SORT is compatible with multiple gene editing techniques, including mRNA, Cas9 " |
| "mRNA/single guide RNA and Cas9 ribonucleoprotein complexes, and is envisioned to aid the " |
| "development of protein replacement and gene correction therapeutics in targeted tissues." |
| ), |
| "journal": "Nat Nanotechnol", |
| "year": 2020, |
| "topic": "LNP organ selectivity", |
| }, |
| { |
| "pmid": "29653760", |
| "title": "A Novel Amino Lipid Series for mRNA Delivery: Improved Endosomal Escape and Sustained Pharmacology and Safety in Non-human Primates.", |
| "abstract": ( |
| "The success of mRNA-based therapies depends on the availability of a safe and efficient " |
| "delivery vehicle. Lipid nanoparticles have been identified as a viable option. However, " |
| "there are concerns whether an acceptable tolerability profile for chronic dosing can be " |
| "achieved. The efficiency and tolerability of lipid nanoparticles has been attributed to the " |
| "amino lipid. Therefore, we developed a new series of amino lipids that address this concern. " |
| "Clear structure-activity relationships were developed that resulted in a new amino lipid " |
| "that affords efficient mRNA delivery in rodent and primate models with optimal " |
| "pharmacokinetics. A 1-month toxicology evaluation in rat and non-human primate demonstrated " |
| "no adverse events with the new lipid nanoparticle system. Mechanistic studies demonstrate " |
| "that the improved efficiency can be attributed to increased endosomal escape. This effort " |
| "has resulted in the first example of the ability to safely repeat dose mRNA-containing lipid " |
| "nanoparticles in non-human primate at therapeutically relevant levels." |
| ), |
| "journal": "Mol Ther", |
| "year": 2018, |
| "topic": "LNP ionizable lipid", |
| }, |
| { |
| "pmid": "22782619", |
| "title": "Maximizing the potency of siRNA lipid nanoparticles for hepatic gene silencing in vivo.", |
| "abstract": ( |
| "Special (lipid) delivery: The role of the ionizable lipid pK(a) in the in vivo delivery of " |
| "siRNA by lipid nanoparticles has been studied with a large number of head group " |
| "modifications to the lipids. A tight correlation between the lipid pK(a) value and silencing " |
| "of the mouse FVII gene (FVII ED(50) ) was found, with an optimal pK(a) range of 6.2-6.5. The " |
| "most potent cationic lipid from this study has ED(50) levels around 0.005 mg kg(-1) in mice " |
| "and less than 0.03 mg kg(-1) in non-human primates." |
| ), |
| "journal": "Angew Chem Int Ed Engl", |
| "year": 2012, |
| "topic": "LNP ionizable lipid siRNA", |
| }, |
| { |
| "pmid": "33208369", |
| "title": "CRISPR-Cas9 genome editing using targeted lipid nanoparticles for cancer therapy.", |
| "abstract": ( |
| "Harnessing CRISPR-Cas9 technology for cancer therapeutics has been hampered by low editing " |
| "efficiency in tumors and potential toxicity of existing delivery systems. Here, we describe " |
| "a safe and efficient lipid nanoparticle (LNP) for the delivery of Cas9 mRNA and sgRNAs that " |
| "use a novel amino-ionizable lipid. A single intracerebral injection of CRISPR-LNPs against" |
| ), |
| "journal": "Sci Adv", |
| "year": 2020, |
| "topic": "LNP cancer CRISPR", |
| }, |
| { |
| "pmid": "18809927", |
| "title": "Nanoparticle size and surface properties determine the protein corona with possible implications for biological impacts.", |
| "abstract": ( |
| "Nanoparticles in a biological fluid (plasma, or otherwise) associate with a range of " |
| "biopolymers, especially proteins, organized into the \"protein corona\" that is associated " |
| "with the nanoparticle and continuously exchanging with the proteins in the environment. " |
| "Methodologies to determine the corona and to understand its dependence on nanomaterial " |
| "properties are likely to become important in bionanoscience. Here, we study the long-lived " |
| "(\"hard\") protein corona formed from human plasma for a range of nanoparticles that differ " |
| "in surface properties and size. Six different polystyrene nanoparticles were studied: three " |
| "different surface chemistries (plain PS, carboxyl-modified, and amine-modified) and two " |
| "sizes of each (50 and 100 nm), enabling us to perform systematic studies of the effect of " |
| "surface properties and size on the detailed protein coronas. Proteins in the corona that are " |
| "conserved and unique across the nanoparticle types were identified and classified according " |
| "to the protein functional properties. Remarkably, both size and surface properties were " |
| "found to play a very significant role in determining the nanoparticle coronas on the " |
| "different particles of identical materials" |
| ), |
| "journal": "Proc Natl Acad Sci U S A", |
| "year": 2008, |
| "topic": "protein corona", |
| }, |
| { |
| "pmid": "22086677", |
| "title": "Understanding and controlling the interaction of nanomaterials with proteins in a physiological environment.", |
| "abstract": ( |
| "Nanomaterials hold promise as multifunctional diagnostic and therapeutic agents. However, " |
| "the effective application of nanomaterials is hampered by limited understanding and control " |
| "over their interactions with complex biological systems. When a nanomaterial enters a " |
| "physiological environment, it rapidly adsorbs proteins forming what is known as the protein " |
| "\'corona\'. The protein corona alters the size and interfacial composition of a " |
| "nanomaterial, giving it a biological identity that is distinct from its synthetic identity. " |
| "The biological identity determines the physiological response including signalling, " |
| "kinetics, transport, accumulation, and toxicity. The structure and composition of the " |
| "protein corona depends on the synthetic identity of the nanomaterial (size, shape, and " |
| "composition), the nature of the physiological environment (blood, interstitial fluid, cell " |
| "cytoplasm, etc.), and the duration of exposure. In this critical review, we discuss the " |
| "formation of the protein corona, its structure and composition, and its influence on the " |
| "physiological response. We also present an \'adsorbome\' of 125 plasma proteins that are " |
| "known to associate with nanomaterials. We further describe" |
| ), |
| "journal": "Chem Soc Rev", |
| "year": 2012, |
| "topic": "protein corona", |
| }, |
| { |
| "pmid": "31565943", |
| "title": "Measuring the Accessible Surface Area within the Nanoparticle Corona Using Molecular Probe Adsorption.", |
| "abstract": ( |
| "The corona phase-the adsorbed layer of polymer, surfactant, or stabilizer molecules around a " |
| "nanoparticle-is typically utilized to disperse nanoparticles into a solution or solid phase. " |
| "However, this phase also controls molecular access to the nanoparticle surface, a property " |
| "important for catalytic activity and sensor applications. Unfortunately, few methods can " |
| "directly probe the structure of this corona phase, which is subcategorized as either a hard, " |
| "immobile corona or a soft, transient corona in exchange with components in the bulk " |
| "solution. In this work, we introduce a molecular probe adsorption (MPA) method for measuring " |
| "the accessible nanoparticle surface area using a titration of a quenchable fluorescent " |
| "molecule. For example, riboflavin is utilized to measure the surface area of gold " |
| "nanoparticle standards, as well as corona phases on dispersed single-walled carbon nanotubes " |
| "and graphene sheets. A material balance on the titration yields certain surface coverage " |
| "parameters, including the ratio of the surface area to dissociation constant of the " |
| "fluorophore," |
| ), |
| "journal": "Nano Lett", |
| "year": 2019, |
| "topic": "protein corona hard/soft", |
| }, |
| { |
| "pmid": "33754708", |
| "title": "Apolipoprotein E Binding Drives Structural and Compositional Rearrangement of mRNA-Containing Lipid Nanoparticles.", |
| "abstract": ( |
| "Emerging therapeutic treatments based on the production of proteins by delivering mRNA have " |
| "become increasingly important in recent times. While lipid nanoparticles (LNPs) are approved " |
| "vehicles for small interfering RNA delivery, there are still challenges to use this " |
| "formulation for mRNA delivery. LNPs are typically a mixture of a cationic lipid, " |
| "distearoylphosphatidylcholine (DSPC), cholesterol, and a PEG-lipid. The structural " |
| "characterization of mRNA-containing LNPs (mRNA-LNPs) is crucial for a full understanding of " |
| "the way in which they function, but this information alone is not enough to predict their " |
| "fate upon entering the bloodstream. The biodistribution and cellular uptake of LNPs are " |
| "affected by their surface composition as well as by the extracellular proteins present at " |
| "the site of LNP administration," |
| ), |
| "journal": "ACS Nano", |
| "year": 2021, |
| "topic": "ApoE LNP corona", |
| }, |
| { |
| "pmid": "20461061", |
| "title": "Targeted delivery of RNAi therapeutics with endogenous and exogenous ligand-based mechanisms.", |
| "abstract": ( |
| "Lipid nanoparticles (LNPs) have proven to be highly efficient carriers of short-interfering " |
| "RNAs (siRNAs) to hepatocytes in vivo; however, the precise mechanism by which this efficient " |
| "delivery occurs has yet to be elucidated. We found that apolipoprotein E (apoE), which plays " |
| "a major role in the clearance and hepatocellular uptake of physiological lipoproteins, also " |
| "acts as an endogenous targeting ligand for ionizable LNPs (iLNPs), but not cationic LNPs " |
| "(cLNPs). The role of apoE was investigated using both in vitro studies employing recombinant " |
| "apoE and in vivo studies in wild-type and apoE(-/-) mice. Receptor dependence was explored " |
| "in vitro and in vivo using low-density lipoprotein receptor (LDLR(-/-))-deficient mice. As " |
| "an alternative to endogenous apoE-based targeting, we developed a targeting approach using " |
| "an exogenous ligand containing a multivalent N-acetylgalactosamine (GalNAc)-cluster, which " |
| "binds with high affinity to the asialoglycoprotein receptor (ASGPR) expressed on " |
| "hepatocytes. Both apoE-based endogenous and GalNAc-based exogenous targeting appear to be " |
| "highly effective strategies for the delivery of iLNPs to liver." |
| ), |
| "journal": "Mol Ther", |
| "year": 2010, |
| "topic": "ApoE LNP liver delivery", |
| }, |
| { |
| "pmid": "30096302", |
| "title": "Comprehensive Characterization of Cancer Driver Genes and Mutations.", |
| "abstract": ( |
| "[Summary — abstract not available in PubMed XML] Bailey MH et al. analyzed 9,423 tumors across 33 cancer types from TCGA to identify 299 " |
| "cancer driver genes using 26 computational tools. The study found that most cancers have 2-6 " |
| "driver gene mutations. TP53 is the most frequently mutated driver gene (42% of cancers). " |
| "KRAS mutations dominate in PDAC (92%), LUAD (33%), and COAD (43%). Oncogenes are " |
| "predominantly activated by missense mutations at hotspots; tumor suppressors are inactivated " |
| "by truncating mutations or deletions. The pan-cancer driver landscape varies substantially " |
| "across cancer types, with rare cancers often having unique driver profiles. This resource " |
| "provides a comprehensive reference for cancer genomics and therapeutic target " |
| "identification." |
| ), |
| "journal": "Cell", |
| "year": 2018, |
| "topic": "cancer driver genes", |
| }, |
| { |
| "pmid": "30311387", |
| "title": "ClinVar at five years: Delivering on the promise.", |
| "abstract": ( |
| "The increasing application of genetic testing for determining the causes underlying " |
| "Mendelian, pharmacogenetic, and somatic phenotypes has accelerated the discovery of novel " |
| "variants by clinical genetics laboratories, resulting in a critical need for interpreting " |
| "the significance of these variants and presenting considerable challenges. Launched in 2013 " |
| "at the National Center for Biotechnology Information, National Institutes of Health, ClinVar " |
| "is a public database for clinical laboratories, researchers, expert panels, and others to " |
| "share their interpretations of variants with their evidence. The database holds 600,000 " |
| "submitted records from 1,000 submitters, representing 430,000 unique variants. ClinVar " |
| "encourages submissions of variants reviewed by expert panels, as expert consensus confers a " |
| "high standard. Aggregating data from many groups in a single database allows comparison of " |
| "interpretations, providing transparency into the concordance or discordance of " |
| "interpretations. In its first five years, ClinVar has successfully provided a gateway for " |
| "the submission of medically relevant variants and interpretations of their significance to " |
| "disease. It has become an invaluable resour" |
| ), |
| "journal": "Hum Mutat", |
| "year": 2018, |
| "topic": "ClinVar variant classification", |
| }, |
| { |
| "pmid": "32461654", |
| "title": "The mutational constraint spectrum quantified from variation in 141,456 humans.", |
| "abstract": ( |
| "Genetic variants that inactivate protein-coding genes are a powerful source of information " |
| "about the phenotypic consequences of gene disruption: genes that are crucial for the " |
| "function of an organism will be depleted of such variants in natural populations, whereas " |
| "non-essential genes will tolerate their accumulation. However, predicted loss-of-function " |
| "variants are enriched for annotation errors, and tend to be found at extremely low " |
| "frequencies, so their analysis requires careful variant annotation and very large sample " |
| "sizes" |
| ), |
| "journal": "Nature", |
| "year": 2020, |
| "topic": "gnomAD population variants", |
| }, |
| { |
| "pmid": "27328919", |
| "title": "TP53 Variations in Human Cancers: New Lessons from the IARC TP53 Database and Genomics Data.", |
| "abstract": ( |
| "TP53 gene mutations are one of the most frequent somatic events in cancer. The IARC TP53 " |
| "Database (http://p53.iarc.fr) is a popular resource that compiles occurrence and phenotype " |
| "data on TP53 germline and somatic variations linked to human cancer. The deluge of data " |
| "coming from cancer genomic studies generates new data on TP53 variations and attracts a " |
| "growing number of database users for the interpretation of TP53 variants. Here, we present " |
| "the current contents and functionalities of the IARC TP53 Database and perform a systematic " |
| "analysis of TP53 somatic mutation data extracted from this database and from genomic data " |
| "repositories. This analysis showed that IARC has more TP53 somatic mutation data than " |
| "genomic repositories (29,000 vs. 4,000). However, the more complete screening achieved by " |
| "genomic studies highlighted some overlooked facts about TP53 mutations, such as the presence " |
| "of a significant number of mutations occurring outside the DNA-binding domain in specific " |
| "cancer types. We also provide an update on TP53 inherited variants including the ones that " |
| "should be considered as neutral frequent variations. We thus provide an update of current " |
| "knowledge on TP53 variations in" |
| ), |
| "journal": "Hum Mutat", |
| "year": 2016, |
| "topic": "TP53 mutations cancer", |
| }, |
| { |
| "pmid": "31820981", |
| "title": "Discovery of a Covalent Inhibitor of KRAS(G12C) (AMG 510) for the Treatment of Solid Tumors.", |
| "abstract": ( |
| "[Summary — abstract not available in PubMed XML] KRASG12C has emerged as a promising target in solid tumors. Lanman BA et al. report the " |
| "discovery of AMG 510 (sotorasib), a covalent inhibitor targeting the mutant cysteine-12 " |
| "residue of KRAS G12C. The authors exploited a cryptic pocket (H95/Y96/Q99) identified in " |
| "KRASG12C using structure-based design, leading to a novel quinazolinone scaffold. AMG 510 is " |
| "highly potent, selective, and well-tolerated. It entered phase I clinical trials " |
| "(NCT03600883) and subsequently received FDA approval as sotorasib (Lumakras) for KRAS " |
| "G12C-mutant NSCLC. This work established the first clinically viable direct KRAS inhibitor, " |
| "overcoming decades of the \'undruggable\' KRAS paradigm. Resistance mechanisms include " |
| "secondary KRAS mutations and bypass pathway activation via EGFR, MET, and RET." |
| ), |
| "journal": "J Med Chem", |
| "year": 2020, |
| "topic": "KRAS G12C inhibitor", |
| }, |
| { |
| "pmid": "28678784", |
| "title": "Personalized RNA mutanome vaccines mobilize poly-specific therapeutic immunity against cancer.", |
| "abstract": ( |
| "T cells directed against mutant neo-epitopes drive cancer immunity. However, spontaneous " |
| "immune recognition of mutations is inefficient. We recently introduced the concept of " |
| "individualized mutanome vaccines and implemented an RNA-based poly-neo-epitope approach to " |
| "mobilize immunity against a spectrum of cancer mutations. Here we report the first-in-human " |
| "application of this concept in melanoma. We set up a process comprising comprehensive " |
| "identification of individual mutations, computational prediction of neo-epitopes, and design " |
| "and manufacturing of a vaccine unique for each patient. All patients developed T cell " |
| "responses against multiple vaccine neo-epitopes at up to high single-digit percentages. " |
| "Vaccine-induced T cell infiltration and neo-epitope-specific killing of autologous tumour " |
| "cells were shown in post-vaccination resected metastases from two patients. The cumulative " |
| "rate of metastatic events was highly significantly reduced after the start of vaccination, " |
| "resulting in a sustained progression-free survival. Two of the five patients with metastatic " |
| "disease experienced vaccine-related objective responses. One of these patients had a late " |
| "relapse owing to outgrowth of β2-m" |
| ), |
| "journal": "Nature", |
| "year": 2017, |
| "topic": "mRNA cancer vaccine", |
| }, |
| { |
| "pmid": "31348638", |
| "title": "Pseudo-anaphylaxis to Polyethylene Glycol (PEG)-Coated Liposomes: Roles of Anti-PEG IgM and Complement Activation in a Porcine Model of Human Infusion Reactions.", |
| "abstract": ( |
| "Polyethylene glycol (PEG)-coated nanopharmaceuticals can cause mild to severe " |
| "hypersensitivity reactions (HSRs), which can occasionally be life threatening or even " |
| "lethal. The phenomenon represents an unsolved immune barrier to the use of these drugs, yet " |
| "its mechanism is poorly understood. This study showed that a single i.v. injection in pigs " |
| "of a low dose of PEGylated liposomes (Doxebo) induced a massive rise of anti-PEG IgM in " |
| "blood, peaking at days 7-9 and declining over 6 weeks. Bolus injections of PEG-liposomes " |
| "during seroconversion resulted in anaphylactoid shock (pseudo-anaphylaxis) within 2-3 min, " |
| "although similar treatments of naı̈ve animals led to only mild hemodynamic disturbance. " |
| "Parallel measurement of pulmonary arterial pressure (PAP) and sC5b-9 in blood, taken as " |
| "measures of HSR and complement activation, respectively, showed a concordant rise of the two " |
| "variables within 3 min and a decline within 15 min, suggesting a causal relationship between " |
| "complement activation and pulmonary hypertension. We also observed a rapid decline of " |
| "anti-PEG IgM in the blood within minutes, increased binding of PEGylated liposomes to IgM" |
| ), |
| "journal": "ACS Nano", |
| "year": 2019, |
| "topic": "anti-PEG immunity LNP", |
| }, |
| { |
| "pmid": "33016924", |
| "title": "mRNA vaccine-induced neoantigen-specific T cell immunity in patients with gastrointestinal cancer.", |
| "abstract": ( |
| "BACKGROUNDTherapeutic vaccinations against cancer have mainly targeted differentiation " |
| "antigens, cancer-testis antigens, and overexpressed antigens and have thus far resulted in " |
| "little clinical benefit. Studies conducted by multiple groups have demonstrated that T cells " |
| "recognizing neoantigens are present in most cancers and offer a specific and highly " |
| "immunogenic target for personalized vaccination.METHODSWe recently developed a process using " |
| "tumor-infiltrating lymphocytes to identify the specific immunogenic mutations expressed in " |
| "patients\' tumors. Here, validated, defined neoantigens, predicted neoepitopes, and " |
| "mutations of driver genes were concatenated into a single mRNA construct to vaccinate " |
| "patients with metastatic gastrointestinal cancer.RESULTSThe vaccine was safe and elicited " |
| "mutation-specific T cell responses against predicted neoepitopes not detected before " |
| "vaccination. Furthermore, we were able to isolate and verify T cell receptors targeting " |
| "KRASG12D mutation. We observed no objective clinical responses in the 4 patients treated in " |
| "this trial.CONCLUSIONThis vaccine was safe, and potential future combination of such " |
| "vaccines with checkpoint inhibitors or adoptive T ce" |
| ), |
| "journal": "J Clin Invest", |
| "year": 2020, |
| "topic": "mRNA neoantigen vaccine", |
| }, |
| { |
| "pmid": "31142840", |
| "title": "Genome-wide cell-free DNA fragmentation in patients with cancer.", |
| "abstract": ( |
| "Cristiano S et al. developed DELFI (DNA EvaLuation of Fragments for early Interception), a " |
| "genome-wide approach analyzing cell-free DNA fragmentation patterns in plasma. Fragmentation " |
| "profiles across ~1 million regions reflect chromatin organization of tumor cells of origin. " |
| "Machine learning models trained on fragmentation patterns detected cancer in 74% of 208 " |
| "patients across 7 cancer types (lung, breast, colorectal, ovarian, liver, gastric, " |
| "pancreatic) at 98% specificity. Early-stage detection sensitivity was 57% for Stage I/II. " |
| "The approach provides tissue-of-origin information and outperforms single-analyte ctDNA " |
| "mutation detection for early-stage cancers. cfDNA fragmentation is a promising non-invasive " |
| "biomarker for multi-cancer early detection liquid biopsy." |
| ), |
| "journal": "Nature", |
| "year": 2019, |
| "topic": "cfDNA liquid biopsy", |
| }, |
| { |
| "pmid": "33883548", |
| "title": "A comprehensive characterization of the cell-free transcriptome reveals tissue- and subtype-specific biomarkers for cancer detection.", |
| "abstract": ( |
| "Cell-free RNA (cfRNA) is a promising analyte for cancer detection. However, a comprehensive " |
| "assessment of cfRNA in individuals with and without cancer has not been conducted. We " |
| "perform the first transcriptome-wide characterization of cfRNA in cancer (stage III breast " |
| "[n = 46], lung [n = 30]) and non-cancer (n = 89) participants from the Circulating Cell-free " |
| "Genome Atlas (NCT02889978). Of 57,820 annotated genes, 39,564 (68%) are not detected in " |
| "cfRNA from non-cancer individuals. Within these low-noise regions, we identify tissue- and " |
| "cancer-specific genes, defined as \"dark channel biomarker\" (DCB) genes, that are " |
| "recurrently detected in individuals with cancer. DCB levels in plasma correlate with tumor " |
| "shedding rate and RNA expression in matched tissue, suggesting that DCBs with high " |
| "expression in tumor tissue could enhance cancer detection in patients with low levels of " |
| "circulating tumor DNA. Overall, cfRNA provides a unique opportunity to detect cancer, " |
| "predict the tumor tissue of origin, and determine the cancer subtype." |
| ), |
| "journal": "Nat Commun", |
| "year": 2021, |
| "topic": "cfRNA liquid biopsy", |
| }, |
| ] |
|
|
| |
| |
| |
|
|
| _rag_index = None |
| _rag_embeddings = None |
| _rag_model = None |
|
|
| EMBED_MODEL = "all-MiniLM-L6-v2" |
|
|
|
|
| def _build_index(): |
| """Build FAISS index from paper corpus. Called once at startup.""" |
| global _rag_index, _rag_embeddings, _rag_model |
|
|
| try: |
| from sentence_transformers import SentenceTransformer |
| import faiss |
| except ImportError: |
| return False, "sentence-transformers or faiss-cpu not installed. Run: pip install sentence-transformers faiss-cpu" |
|
|
| _rag_model = SentenceTransformer(EMBED_MODEL) |
|
|
| |
| texts = [] |
| for paper in PAPER_CORPUS: |
| chunk = f"Title: {paper['title']}\nAbstract: {paper['abstract']}\nJournal: {paper['journal']} ({paper['year']})" |
| texts.append(chunk) |
|
|
| _rag_embeddings = _rag_model.encode(texts, convert_to_numpy=True, show_progress_bar=False) |
| _rag_embeddings = _rag_embeddings / np.linalg.norm(_rag_embeddings, axis=1, keepdims=True) |
|
|
| dim = _rag_embeddings.shape[1] |
| _rag_index = faiss.IndexFlatIP(dim) |
| _rag_index.add(_rag_embeddings.astype(np.float32)) |
|
|
| return True, f"Index built: {len(PAPER_CORPUS)} papers, {dim}-dim embeddings" |
|
|
|
|
| def _confidence_flag(score: float, n_results: int) -> str: |
| """Assign confidence based on retrieval score.""" |
| if score >= 0.55 and n_results >= 2: |
| return "🟢 HIGH" |
| elif score >= 0.35: |
| return "🟡 MEDIUM" |
| else: |
| return "🔴 SPECULATIVE" |
|
|
|
|
| def rag_query(question: str, top_k: int = 3) -> str: |
| """Query the RAG index and return a grounded answer.""" |
| global _rag_index, _rag_model |
|
|
| if _rag_index is None: |
| ok, msg = _build_index() |
| if not ok: |
| return f"⚠️ RAG system unavailable: {msg}" |
|
|
| try: |
| from sentence_transformers import SentenceTransformer |
| import faiss |
| except ImportError: |
| return "⚠️ Required packages not installed: `pip install sentence-transformers faiss-cpu`" |
|
|
| |
| q_emb = _rag_model.encode([question], convert_to_numpy=True, show_progress_bar=False) |
| q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True) |
|
|
| |
| scores, indices = _rag_index.search(q_emb.astype(np.float32), top_k) |
| scores = scores[0] |
| indices = indices[0] |
|
|
| |
| MIN_SCORE = 0.20 |
| valid = [(s, i) for s, i in zip(scores, indices) if s >= MIN_SCORE and i >= 0] |
|
|
| if not valid: |
| return ( |
| "❌ **No relevant information found in the indexed papers.**\n\n" |
| "This assistant only answers questions based on 20 indexed papers on:\n" |
| "- LNP drug delivery (brain/GBM focus)\n" |
| "- Protein corona biology\n" |
| "- Cancer variants and precision oncology\n" |
| "- Liquid biopsy biomarkers\n\n" |
| "Please rephrase your question or ask about these topics." |
| ) |
|
|
| top_score = valid[0][0] |
| confidence = _confidence_flag(top_score, len(valid)) |
|
|
| |
| answer_parts = [f"**Confidence: {confidence}** (retrieval score: {top_score:.3f})\n"] |
|
|
| for rank, (score, idx) in enumerate(valid, 1): |
| paper = PAPER_CORPUS[idx] |
| answer_parts.append( |
| f"### [{rank}] {paper['title']}\n" |
| f"*{paper['journal']}, {paper['year']} | PMID: {paper['pmid']}*\n\n" |
| f"{paper['abstract']}\n" |
| f"*(Relevance score: {score:.3f})*" |
| ) |
|
|
| answer_parts.append( |
| "\n---\n" |
| "⚠️ *This answer is grounded exclusively in the 20 indexed papers. " |
| "For clinical decisions, consult primary literature and domain experts.*" |
| ) |
|
|
| return "\n\n".join(answer_parts) |
|
|
|
|
| |
| |
| |
|
|
| def build_chatbot_tab(): |
| """Called from app.py to inject the chatbot into Tab A6.""" |
|
|
| gr.Markdown( |
| "**Status:** Model loads on first query (~30s)...\n\n" |
| "Ask questions about LNP delivery, protein corona, cancer variants, or liquid biopsy. " |
| "Answers are grounded in 20 indexed papers — never fabricated." |
| ) |
|
|
| with gr.Row(): |
| with gr.Column(scale=3): |
| chatbox = gr.Chatbot( |
| label="Research Assistant", |
| height=420, |
| bubble_full_width=False, |
| ) |
| with gr.Row(): |
| user_input = gr.Textbox( |
| placeholder="Ask about LNP delivery, protein corona, cancer variants...", |
| label="Your question", |
| lines=2, |
| scale=4, |
| ) |
| send_btn = gr.Button("Send", variant="primary", scale=1) |
| clear_btn = gr.Button("🗑️ Clear conversation", size="sm") |
|
|
| with gr.Column(scale=1): |
| gr.Markdown("### 📚 Indexed Topics") |
| gr.Markdown( |
| "**LNP Delivery**\n" |
| "- mRNA-LNP formulation\n" |
| "- Ionizable lipids & pKa\n" |
| "- Brain/GBM delivery\n" |
| "- Organ selectivity (SORT)\n" |
| "- PEG & anti-PEG immunity\n\n" |
| "**Protein Corona**\n" |
| "- Hard vs soft corona\n" |
| "- Vroman effect kinetics\n" |
| "- ApoE/LDLR targeting\n\n" |
| "**Cancer Variants**\n" |
| "- TP53 mutation spectrum\n" |
| "- KRAS G12C resistance\n" |
| "- ClinVar classification\n\n" |
| "**Liquid Biopsy**\n" |
| "- ctDNA methylation\n" |
| "- cfRNA biomarkers" |
| ) |
| gr.Markdown( |
| "### 🔑 Confidence Flags\n" |
| "🟢 **HIGH** — strong match (≥0.55)\n" |
| "🟡 **MEDIUM** — moderate match (0.35–0.55)\n" |
| "🔴 **SPECULATIVE** — weak match (<0.35)\n\n" |
| "*Only answers from indexed papers are shown.*" |
| ) |
|
|
| def respond(message, history): |
| if not message.strip(): |
| return history, "" |
| answer = rag_query(message.strip()) |
| history = history or [] |
| history.append((message, answer)) |
| return history, "" |
|
|
| send_btn.click(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input]) |
| user_input.submit(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input]) |
| clear_btn.click(lambda: ([], ""), outputs=[chatbox, user_input]) |
|
|
| |
| |
| |
|
|
| if __name__ == "__main__": |
| print("Building RAG index...") |
| ok, msg = _build_index() |
| print(msg) |
|
|
| with gr.Blocks(title="K R&D Lab — Research Assistant") as demo: |
| gr.Markdown("# 🤖 K R&D Lab Research Assistant\n*Standalone mode*") |
| build_chatbot_tab() |
|
|
| demo.launch(share=False) |
|
|