File size: 39,240 Bytes
35e6a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbb0103
35e6a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fbb0103
35e6a9d
 
 
fbb0103
35e6a9d
 
fbb0103
35e6a9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
"""
K R&D Lab — Research Assistant (RAG Chatbot)
Author: Oksana Kolisnyk | kosatiks-group.pp.ua
Repo:   github.com/TEZv/K-RnD-Lab-PHYLO-03_2026

RAG pipeline: sentence-transformers + FAISS (no API key required)
Indexed on 20 curated papers: LNP delivery, protein corona, cancer variants
Confidence flags: HIGH / MEDIUM / SPECULATIVE
Never answers outside indexed papers.
"""

import os
import json
import time
import hashlib
import datetime
import requests
import gradio as gr
import numpy as np

# ─────────────────────────────────────────────
# PAPER CORPUS — 20 curated PMIDs
# Topics: LNP/brain delivery, protein corona, cancer variants & precision oncology,
#         LNP immunotherapy, liquid biopsy
# ─────────────────────────────────────────────

# PubMed IDs of the corpus papers, grouped by topic. The number in parentheses
# after each group heading is the group size (5 + 5 + 5 + 3 + 2 = 20).
# The IDs appear in the same order as the entries of PAPER_CORPUS below.
PAPER_PMIDS = [
    # LNP delivery (5) — all PubMed-verified
    "34394960",   # Hou X — LNP mRNA delivery review (Nat Rev Mater 2021)
    "32251383",   # Cheng Q — SORT LNPs organ selectivity (Nat Nanotechnol 2020)
    "29653760",   # Sabnis S — novel amino lipid series for mRNA (Mol Ther 2018)
    "22782619",   # Jayaraman M — ionizable lipid siRNA LNP potency (Angew Chem 2012)
    "33208369",   # Rosenblum D — CRISPR-Cas9 LNP cancer therapy (Sci Adv 2020)
    # Protein corona (5)
    "18809927",   # Lundqvist M — nanoparticle size/surface protein corona (PNAS 2008)
    "22086677",   # Walkey CD — nanomaterial-protein interactions (Chem Soc Rev 2012)
    "31565943",   # Park M — accessible surface area within nanoparticle corona (Nano Lett 2019)
    "33754708",   # Sebastiani F — ApoE binding drives LNP rearrangement (ACS Nano 2021)
    "20461061",   # Akinc A — endogenous ApoE-mediated LNP liver delivery (Mol Ther 2010)
    # Cancer variants & precision oncology (5)
    "30096302",   # Bailey MH — cancer driver genes TCGA (Cell 2018)
    "30311387",   # Landrum MJ — ClinVar at five years (Hum Mutat 2018)
    "32461654",   # Karczewski KJ — gnomAD mutational constraint 141,456 humans (Nature 2020)
    "27328919",   # Bouaoun L — TP53 variations IARC database (Hum Mutat 2016)
    "31820981",   # Lanman BA — KRAS G12C covalent inhibitor AMG 510 (J Med Chem 2020)
    # LNP immunotherapy & siRNA (3)
    "28678784",   # Sahin U — personalized RNA mutanome vaccines (Nature 2017)
    "31348638",   # Kozma GT — anti-PEG IgM complement activation (ACS Nano 2019)
    "33016924",   # Cafri G — mRNA neoantigen T cell immunity GI cancer (J Clin Invest 2020)
    # Liquid biopsy (2)
    "31142840",   # Cristiano S — genome-wide cfDNA fragmentation in cancer (Nature 2019)
    "33883548",   # Larson MH — cell-free transcriptome tissue biomarkers (Nat Commun 2021)
]

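# Curated abstracts / key content for each PMID
# Verified against PubMed esummary + efetch API — 2026-03-07
# All PMIDs confirmed real; abstracts fetched directly from NCBI where available.
# Entries whose text starts with "[Summary — abstract not available in PubMed XML]"
# hold a summary instead of the published abstract, and several abstract strings
# below end mid-sentence (truncated text).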
PAPER_CORPUS = [
    {
        "pmid": "34394960",
        "title": "Lipid nanoparticles for mRNA delivery.",
        "abstract": (
            "Messenger RNA (mRNA) has emerged as a new category of therapeutic agent to prevent and treat "
            "various diseases. To function in vivo, mRNA requires safe, effective and stable delivery "
            "systems that protect the nucleic acid from degradation and that allow cellular uptake and "
            "mRNA release. Lipid nanoparticles have successfully entered the clinic for the delivery of "
            "mRNA; in particular, lipid nanoparticle-mRNA vaccines are now in clinical use against "
            "coronavirus disease 2019 (COVID-19), which marks a milestone for mRNA therapeutics. In this "
            "Review, we discuss the design of lipid nanoparticles for mRNA delivery and examine "
            "physiological barriers and possible administration routes for lipid nanoparticle-mRNA "
            "systems. We then consider key points for the clinical translation of lipid nanoparticle-mRNA "
            "formulations, including good manufacturing practice, stability, storage and safety, and "
            "highlight preclinical and clinical studies of lipid nanoparticle-mRNA therapeutics for "
            "infectious diseases, cancer and genetic disorders. Finally, we give an outlook to future "
            "possibilities and remaining challenges for this promising technology."
        ),
        "journal": "Nat Rev Mater",
        "year": 2021,
        "topic": "LNP mRNA delivery",
    },
    {
        "pmid": "32251383",
        "title": "Selective organ targeting (SORT) nanoparticles for tissue-specific mRNA delivery and CRISPR-Cas gene editing.",
        "abstract": (
            "CRISPR-Cas gene editing and messenger RNA-based protein replacement therapy hold tremendous "
            "potential to effectively treat disease-causing mutations with diverse cellular origin. "
            "However, it is currently impossible to rationally design nanoparticles that selectively "
            "target specific tissues. Here, we report a strategy termed selective organ targeting (SORT) "
            "wherein multiple classes of lipid nanoparticles are systematically engineered to exclusively "
            "edit extrahepatic tissues via addition of a supplemental SORT molecule. Lung-, spleen- and "
            "liver-targeted SORT lipid nanoparticles were designed to selectively edit therapeutically "
            "relevant cell types including epithelial cells, endothelial cells, B cells, T cells and "
            "hepatocytes. SORT is compatible with multiple gene editing techniques, including mRNA, Cas9 "
            "mRNA/single guide RNA and Cas9 ribonucleoprotein complexes, and is envisioned to aid the "
            "development of protein replacement and gene correction therapeutics in targeted tissues."
        ),
        "journal": "Nat Nanotechnol",
        "year": 2020,
        "topic": "LNP organ selectivity",
    },
    {
        "pmid": "29653760",
        "title": "A Novel Amino Lipid Series for mRNA Delivery: Improved Endosomal Escape and Sustained Pharmacology and Safety in Non-human Primates.",
        "abstract": (
            "The success of mRNA-based therapies depends on the availability of a safe and efficient "
            "delivery vehicle. Lipid nanoparticles have been identified as a viable option. However, "
            "there are concerns whether an acceptable tolerability profile for chronic dosing can be "
            "achieved. The efficiency and tolerability of lipid nanoparticles has been attributed to the "
            "amino lipid. Therefore, we developed a new series of amino lipids that address this concern. "
            "Clear structure-activity relationships were developed that resulted in a new amino lipid "
            "that affords efficient mRNA delivery in rodent and primate models with optimal "
            "pharmacokinetics. A 1-month toxicology evaluation in rat and non-human primate demonstrated "
            "no adverse events with the new lipid nanoparticle system. Mechanistic studies demonstrate "
            "that the improved efficiency can be attributed to increased endosomal escape. This effort "
            "has resulted in the first example of the ability to safely repeat dose mRNA-containing lipid "
            "nanoparticles in non-human primate at therapeutically relevant levels."
        ),
        "journal": "Mol Ther",
        "year": 2018,
        "topic": "LNP ionizable lipid",
    },
    {
        "pmid": "22782619",
        "title": "Maximizing the potency of siRNA lipid nanoparticles for hepatic gene silencing in vivo.",
        "abstract": (
            "Special (lipid) delivery: The role of the ionizable lipid pK(a) in the in vivo delivery of "
            "siRNA by lipid nanoparticles has been studied with a large number of head group "
            "modifications to the lipids. A tight correlation between the lipid pK(a) value and silencing "
            "of the mouse FVII gene (FVII ED(50) ) was found, with an optimal pK(a) range of 6.2-6.5. The "
            "most potent cationic lipid from this study has ED(50) levels around 0.005 mg kg(-1) in mice "
            "and less than 0.03 mg kg(-1) in non-human primates."
        ),
        "journal": "Angew Chem Int Ed Engl",
        "year": 2012,
        "topic": "LNP ionizable lipid siRNA",
    },
    {
        "pmid": "33208369",
        "title": "CRISPR-Cas9 genome editing using targeted lipid nanoparticles for cancer therapy.",
        "abstract": (
            "Harnessing CRISPR-Cas9 technology for cancer therapeutics has been hampered by low editing "
            "efficiency in tumors and potential toxicity of existing delivery systems. Here, we describe "
            "a safe and efficient lipid nanoparticle (LNP) for the delivery of Cas9 mRNA and sgRNAs that "
            "use a novel amino-ionizable lipid. A single intracerebral injection of CRISPR-LNPs against"
        ),
        "journal": "Sci Adv",
        "year": 2020,
        "topic": "LNP cancer CRISPR",
    },
    {
        "pmid": "18809927",
        "title": "Nanoparticle size and surface properties determine the protein corona with possible implications for biological impacts.",
        "abstract": (
            "Nanoparticles in a biological fluid (plasma, or otherwise) associate with a range of "
            "biopolymers, especially proteins, organized into the \"protein corona\" that is associated "
            "with the nanoparticle and continuously exchanging with the proteins in the environment. "
            "Methodologies to determine the corona and to understand its dependence on nanomaterial "
            "properties are likely to become important in bionanoscience. Here, we study the long-lived "
            "(\"hard\") protein corona formed from human plasma for a range of nanoparticles that differ "
            "in surface properties and size. Six different polystyrene nanoparticles were studied: three "
            "different surface chemistries (plain PS, carboxyl-modified, and amine-modified) and two "
            "sizes of each (50 and 100 nm), enabling us to perform systematic studies of the effect of "
            "surface properties and size on the detailed protein coronas. Proteins in the corona that are "
            "conserved and unique across the nanoparticle types were identified and classified according "
            "to the protein functional properties. Remarkably, both size and surface properties were "
            "found to play a very significant role in determining the nanoparticle coronas on the "
            "different particles of identical materials"
        ),
        "journal": "Proc Natl Acad Sci U S A",
        "year": 2008,
        "topic": "protein corona",
    },
    {
        "pmid": "22086677",
        "title": "Understanding and controlling the interaction of nanomaterials with proteins in a physiological environment.",
        "abstract": (
            "Nanomaterials hold promise as multifunctional diagnostic and therapeutic agents. However, "
            "the effective application of nanomaterials is hampered by limited understanding and control "
            "over their interactions with complex biological systems. When a nanomaterial enters a "
            "physiological environment, it rapidly adsorbs proteins forming what is known as the protein "
            "\'corona\'. The protein corona alters the size and interfacial composition of a "
            "nanomaterial, giving it a biological identity that is distinct from its synthetic identity. "
            "The biological identity determines the physiological response including signalling, "
            "kinetics, transport, accumulation, and toxicity. The structure and composition of the "
            "protein corona depends on the synthetic identity of the nanomaterial (size, shape, and "
            "composition), the nature of the physiological environment (blood, interstitial fluid, cell "
            "cytoplasm, etc.), and the duration of exposure. In this critical review, we discuss the "
            "formation of the protein corona, its structure and composition, and its influence on the "
            "physiological response. We also present an \'adsorbome\' of 125 plasma proteins that are "
            "known to associate with nanomaterials. We further describe"
        ),
        "journal": "Chem Soc Rev",
        "year": 2012,
        "topic": "protein corona",
    },
    {
        "pmid": "31565943",
        "title": "Measuring the Accessible Surface Area within the Nanoparticle Corona Using Molecular Probe Adsorption.",
        "abstract": (
            "The corona phase-the adsorbed layer of polymer, surfactant, or stabilizer molecules around a "
            "nanoparticle-is typically utilized to disperse nanoparticles into a solution or solid phase. "
            "However, this phase also controls molecular access to the nanoparticle surface, a property "
            "important for catalytic activity and sensor applications. Unfortunately, few methods can "
            "directly probe the structure of this corona phase, which is subcategorized as either a hard, "
            "immobile corona or a soft, transient corona in exchange with components in the bulk "
            "solution. In this work, we introduce a molecular probe adsorption (MPA) method for measuring "
            "the accessible nanoparticle surface area using a titration of a quenchable fluorescent "
            "molecule. For example, riboflavin is utilized to measure the surface area of gold "
            "nanoparticle standards, as well as corona phases on dispersed single-walled carbon nanotubes "
            "and graphene sheets. A material balance on the titration yields certain surface coverage "
            "parameters, including the ratio of the surface area to dissociation constant of the "
            "fluorophore,"
        ),
        "journal": "Nano Lett",
        "year": 2019,
        "topic": "protein corona hard/soft",
    },
    {
        "pmid": "33754708",
        "title": "Apolipoprotein E Binding Drives Structural and Compositional Rearrangement of mRNA-Containing Lipid Nanoparticles.",
        "abstract": (
            "Emerging therapeutic treatments based on the production of proteins by delivering mRNA have "
            "become increasingly important in recent times. While lipid nanoparticles (LNPs) are approved "
            "vehicles for small interfering RNA delivery, there are still challenges to use this "
            "formulation for mRNA delivery. LNPs are typically a mixture of a cationic lipid, "
            "distearoylphosphatidylcholine (DSPC), cholesterol, and a PEG-lipid. The structural "
            "characterization of mRNA-containing LNPs (mRNA-LNPs) is crucial for a full understanding of "
            "the way in which they function, but this information alone is not enough to predict their "
            "fate upon entering the bloodstream. The biodistribution and cellular uptake of LNPs are "
            "affected by their surface composition as well as by the extracellular proteins present at "
            "the site of LNP administration,"
        ),
        "journal": "ACS Nano",
        "year": 2021,
        "topic": "ApoE LNP corona",
    },
    {
        "pmid": "20461061",
        "title": "Targeted delivery of RNAi therapeutics with endogenous and exogenous ligand-based mechanisms.",
        "abstract": (
            "Lipid nanoparticles (LNPs) have proven to be highly efficient carriers of short-interfering "
            "RNAs (siRNAs) to hepatocytes in vivo; however, the precise mechanism by which this efficient "
            "delivery occurs has yet to be elucidated. We found that apolipoprotein E (apoE), which plays "
            "a major role in the clearance and hepatocellular uptake of physiological lipoproteins, also "
            "acts as an endogenous targeting ligand for ionizable LNPs (iLNPs), but not cationic LNPs "
            "(cLNPs). The role of apoE was investigated using both in vitro studies employing recombinant "
            "apoE and in vivo studies in wild-type and apoE(-/-) mice. Receptor dependence was explored "
            "in vitro and in vivo using low-density lipoprotein receptor (LDLR(-/-))-deficient mice. As "
            "an alternative to endogenous apoE-based targeting, we developed a targeting approach using "
            "an exogenous ligand containing a multivalent N-acetylgalactosamine (GalNAc)-cluster, which "
            "binds with high affinity to the asialoglycoprotein receptor (ASGPR) expressed on "
            "hepatocytes. Both apoE-based endogenous and GalNAc-based exogenous targeting appear to be "
            "highly effective strategies for the delivery of iLNPs to liver."
        ),
        "journal": "Mol Ther",
        "year": 2010,
        "topic": "ApoE LNP liver delivery",
    },
    {
        "pmid": "30096302",
        "title": "Comprehensive Characterization of Cancer Driver Genes and Mutations.",
        "abstract": (
            "[Summary — abstract not available in PubMed XML] Bailey MH et al. analyzed 9,423 tumors across 33 cancer types from TCGA to identify 299 "
            "cancer driver genes using 26 computational tools. The study found that most cancers have 2-6 "
            "driver gene mutations. TP53 is the most frequently mutated driver gene (42% of cancers). "
            "KRAS mutations dominate in PDAC (92%), LUAD (33%), and COAD (43%). Oncogenes are "
            "predominantly activated by missense mutations at hotspots; tumor suppressors are inactivated "
            "by truncating mutations or deletions. The pan-cancer driver landscape varies substantially "
            "across cancer types, with rare cancers often having unique driver profiles. This resource "
            "provides a comprehensive reference for cancer genomics and therapeutic target "
            "identification."
        ),
        "journal": "Cell",
        "year": 2018,
        "topic": "cancer driver genes",
    },
    {
        "pmid": "30311387",
        "title": "ClinVar at five years: Delivering on the promise.",
        "abstract": (
            "The increasing application of genetic testing for determining the causes underlying "
            "Mendelian, pharmacogenetic, and somatic phenotypes has accelerated the discovery of novel "
            "variants by clinical genetics laboratories, resulting in a critical need for interpreting "
            "the significance of these variants and presenting considerable challenges. Launched in 2013 "
            "at the National Center for Biotechnology Information, National Institutes of Health, ClinVar "
            "is a public database for clinical laboratories, researchers, expert panels, and others to "
            "share their interpretations of variants with their evidence. The database holds 600,000 "
            "submitted records from 1,000 submitters, representing 430,000 unique variants. ClinVar "
            "encourages submissions of variants reviewed by expert panels, as expert consensus confers a "
            "high standard. Aggregating data from many groups in a single database allows comparison of "
            "interpretations, providing transparency into the concordance or discordance of "
            "interpretations. In its first five years, ClinVar has successfully provided a gateway for "
            "the submission of medically relevant variants and interpretations of their significance to "
            "disease. It has become an invaluable resour"
        ),
        "journal": "Hum Mutat",
        "year": 2018,
        "topic": "ClinVar variant classification",
    },
    {
        "pmid": "32461654",
        "title": "The mutational constraint spectrum quantified from variation in 141,456 humans.",
        "abstract": (
            "Genetic variants that inactivate protein-coding genes are a powerful source of information "
            "about the phenotypic consequences of gene disruption: genes that are crucial for the "
            "function of an organism will be depleted of such variants in natural populations, whereas "
            "non-essential genes will tolerate their accumulation. However, predicted loss-of-function "
            "variants are enriched for annotation errors, and tend to be found at extremely low "
            "frequencies, so their analysis requires careful variant annotation and very large sample "
            "sizes"
        ),
        "journal": "Nature",
        "year": 2020,
        "topic": "gnomAD population variants",
    },
    {
        "pmid": "27328919",
        "title": "TP53 Variations in Human Cancers: New Lessons from the IARC TP53 Database and Genomics Data.",
        "abstract": (
            "TP53 gene mutations are one of the most frequent somatic events in cancer. The IARC TP53 "
            "Database (http://p53.iarc.fr) is a popular resource that compiles occurrence and phenotype "
            "data on TP53 germline and somatic variations linked to human cancer. The deluge of data "
            "coming from cancer genomic studies generates new data on TP53 variations and attracts a "
            "growing number of database users for the interpretation of TP53 variants. Here, we present "
            "the current contents and functionalities of the IARC TP53 Database and perform a systematic "
            "analysis of TP53 somatic mutation data extracted from this database and from genomic data "
            "repositories. This analysis showed that IARC has more TP53 somatic mutation data than "
            "genomic repositories (29,000 vs. 4,000). However, the more complete screening achieved by "
            "genomic studies highlighted some overlooked facts about TP53 mutations, such as the presence "
            "of a significant number of mutations occurring outside the DNA-binding domain in specific "
            "cancer types. We also provide an update on TP53 inherited variants including the ones that "
            "should be considered as neutral frequent variations. We thus provide an update of current "
            "knowledge on TP53 variations in"
        ),
        "journal": "Hum Mutat",
        "year": 2016,
        "topic": "TP53 mutations cancer",
    },
    {
        "pmid": "31820981",
        "title": "Discovery of a Covalent Inhibitor of KRAS(G12C) (AMG 510) for the Treatment of Solid Tumors.",
        "abstract": (
            "[Summary — abstract not available in PubMed XML] KRASG12C has emerged as a promising target in solid tumors. Lanman BA et al. report the "
            "discovery of AMG 510 (sotorasib), a covalent inhibitor targeting the mutant cysteine-12 "
            "residue of KRAS G12C. The authors exploited a cryptic pocket (H95/Y96/Q99) identified in "
            "KRASG12C using structure-based design, leading to a novel quinazolinone scaffold. AMG 510 is "
            "highly potent, selective, and well-tolerated. It entered phase I clinical trials "
            "(NCT03600883) and subsequently received FDA approval as sotorasib (Lumakras) for KRAS "
            "G12C-mutant NSCLC. This work established the first clinically viable direct KRAS inhibitor, "
            "overcoming decades of the \'undruggable\' KRAS paradigm. Resistance mechanisms include "
            "secondary KRAS mutations and bypass pathway activation via EGFR, MET, and RET."
        ),
        "journal": "J Med Chem",
        "year": 2020,
        "topic": "KRAS G12C inhibitor",
    },
    {
        "pmid": "28678784",
        "title": "Personalized RNA mutanome vaccines mobilize poly-specific therapeutic immunity against cancer.",
        "abstract": (
            "T cells directed against mutant neo-epitopes drive cancer immunity. However, spontaneous "
            "immune recognition of mutations is inefficient. We recently introduced the concept of "
            "individualized mutanome vaccines and implemented an RNA-based poly-neo-epitope approach to "
            "mobilize immunity against a spectrum of cancer mutations. Here we report the first-in-human "
            "application of this concept in melanoma. We set up a process comprising comprehensive "
            "identification of individual mutations, computational prediction of neo-epitopes, and design "
            "and manufacturing of a vaccine unique for each patient. All patients developed T cell "
            "responses against multiple vaccine neo-epitopes at up to high single-digit percentages. "
            "Vaccine-induced T cell infiltration and neo-epitope-specific killing of autologous tumour "
            "cells were shown in post-vaccination resected metastases from two patients. The cumulative "
            "rate of metastatic events was highly significantly reduced after the start of vaccination, "
            "resulting in a sustained progression-free survival. Two of the five patients with metastatic "
            "disease experienced vaccine-related objective responses. One of these patients had a late "
            "relapse owing to outgrowth of β2-m"
        ),
        "journal": "Nature",
        "year": 2017,
        "topic": "mRNA cancer vaccine",
    },
    {
        "pmid": "31348638",
        "title": "Pseudo-anaphylaxis to Polyethylene Glycol (PEG)-Coated Liposomes: Roles of Anti-PEG IgM and Complement Activation in a Porcine Model of Human Infusion Reactions.",
        "abstract": (
            "Polyethylene glycol (PEG)-coated nanopharmaceuticals can cause mild to severe "
            "hypersensitivity reactions (HSRs), which can occasionally be life threatening or even "
            "lethal. The phenomenon represents an unsolved immune barrier to the use of these drugs, yet "
            "its mechanism is poorly understood. This study showed that a single i.v. injection in pigs "
            "of a low dose of PEGylated liposomes (Doxebo) induced a massive rise of anti-PEG IgM in "
            "blood, peaking at days 7-9 and declining over 6 weeks. Bolus injections of PEG-liposomes "
            "during seroconversion resulted in anaphylactoid shock (pseudo-anaphylaxis) within 2-3 min, "
            "although similar treatments of naı̈ve animals led to only mild hemodynamic disturbance. "
            "Parallel measurement of pulmonary arterial pressure (PAP) and sC5b-9 in blood, taken as "
            "measures of HSR and complement activation, respectively, showed a concordant rise of the two "
            "variables within 3 min and a decline within 15 min, suggesting a causal relationship between "
            "complement activation and pulmonary hypertension. We also observed a rapid decline of "
            "anti-PEG IgM in the blood within minutes, increased binding of PEGylated liposomes to IgM"
        ),
        "journal": "ACS Nano",
        "year": 2019,
        "topic": "anti-PEG immunity LNP",
    },
    {
        "pmid": "33016924",
        "title": "mRNA vaccine-induced neoantigen-specific T cell immunity in patients with gastrointestinal cancer.",
        "abstract": (
            "BACKGROUNDTherapeutic vaccinations against cancer have mainly targeted differentiation "
            "antigens, cancer-testis antigens, and overexpressed antigens and have thus far resulted in "
            "little clinical benefit. Studies conducted by multiple groups have demonstrated that T cells "
            "recognizing neoantigens are present in most cancers and offer a specific and highly "
            "immunogenic target for personalized vaccination.METHODSWe recently developed a process using "
            "tumor-infiltrating lymphocytes to identify the specific immunogenic mutations expressed in "
            "patients\' tumors. Here, validated, defined neoantigens, predicted neoepitopes, and "
            "mutations of driver genes were concatenated into a single mRNA construct to vaccinate "
            "patients with metastatic gastrointestinal cancer.RESULTSThe vaccine was safe and elicited "
            "mutation-specific T cell responses against predicted neoepitopes not detected before "
            "vaccination. Furthermore, we were able to isolate and verify T cell receptors targeting "
            "KRASG12D mutation. We observed no objective clinical responses in the 4 patients treated in "
            "this trial.CONCLUSIONThis vaccine was safe, and potential future combination of such "
            "vaccines with checkpoint inhibitors or adoptive T ce"
        ),
        "journal": "J Clin Invest",
        "year": 2020,
        "topic": "mRNA neoantigen vaccine",
    },
    {
        "pmid": "31142840",
        "title": "Genome-wide cell-free DNA fragmentation in patients with cancer.",
        "abstract": (
            "Cristiano S et al. developed DELFI (DNA EvaLuation of Fragments for early Interception), a "
            "genome-wide approach analyzing cell-free DNA fragmentation patterns in plasma. Fragmentation "
            "profiles across ~1 million regions reflect chromatin organization of tumor cells of origin. "
            "Machine learning models trained on fragmentation patterns detected cancer in 74% of 208 "
            "patients across 7 cancer types (lung, breast, colorectal, ovarian, liver, gastric, "
            "pancreatic) at 98% specificity. Early-stage detection sensitivity was 57% for Stage I/II. "
            "The approach provides tissue-of-origin information and outperforms single-analyte ctDNA "
            "mutation detection for early-stage cancers. cfDNA fragmentation is a promising non-invasive "
            "biomarker for multi-cancer early detection liquid biopsy."
        ),
        "journal": "Nature",
        "year": 2019,
        "topic": "cfDNA liquid biopsy",
    },
    {
        "pmid": "33883548",
        "title": "A comprehensive characterization of the cell-free transcriptome reveals tissue- and subtype-specific biomarkers for cancer detection.",
        "abstract": (
            "Cell-free RNA (cfRNA) is a promising analyte for cancer detection. However, a comprehensive "
            "assessment of cfRNA in individuals with and without cancer has not been conducted. We "
            "perform the first transcriptome-wide characterization of cfRNA in cancer (stage III breast "
            "[n = 46], lung [n = 30]) and non-cancer (n = 89) participants from the Circulating Cell-free "
            "Genome Atlas (NCT02889978). Of 57,820 annotated genes, 39,564 (68%) are not detected in "
            "cfRNA from non-cancer individuals. Within these low-noise regions, we identify tissue- and "
            "cancer-specific genes, defined as \"dark channel biomarker\" (DCB) genes, that are "
            "recurrently detected in individuals with cancer. DCB levels in plasma correlate with tumor "
            "shedding rate and RNA expression in matched tissue, suggesting that DCBs with high "
            "expression in tumor tissue could enhance cancer detection in patients with low levels of "
            "circulating tumor DNA. Overall, cfRNA provides a unique opportunity to detect cancer, "
            "predict the tumor tissue of origin, and determine the cancer subtype."
        ),
        "journal": "Nat Commun",
        "year": 2021,
        "topic": "cfRNA liquid biopsy",
    },
]

# ─────────────────────────────────────────────
# RAG ENGINE
# ─────────────────────────────────────────────

# Module-level RAG state. All three start as None and are populated by
# _build_index(); rag_query() triggers the build when _rag_index is still None.
_rag_index = None  # FAISS inner-product index holding the corpus embeddings
_rag_embeddings = None  # normalized embedding matrix, one row per PAPER_CORPUS entry
_rag_model = None  # SentenceTransformer instance used to encode corpus texts and queries

EMBED_MODEL = "all-MiniLM-L6-v2"  # 80 MB, runs on CPU, no API key


def _build_index():
    """Build the FAISS index over PAPER_CORPUS and publish it in the module globals.

    Every paper is turned into one text chunk (title, abstract, journal and
    year), embedded with the ``EMBED_MODEL`` sentence-transformer, L2-normalised
    and added to an inner-product index, so search scores are cosine
    similarities.

    The model, embeddings and index are assembled in local variables and only
    assigned to ``_rag_model`` / ``_rag_embeddings`` / ``_rag_index`` once the
    index is fully populated. ``rag_query`` uses ``_rag_index is None`` as its
    "not built yet" check, so it must never observe an empty or half-built
    index, and a failure part-way through must leave the globals untouched.

    Returns:
        tuple[bool, str]: ``(ok, message)``. ``ok`` is False when the optional
        dependencies are missing or the corpus is empty; ``message`` is a
        human-readable status line.

    Exceptions raised while loading the model or encoding the corpus are not
    caught here and propagate to the caller.
    """
    global _rag_index, _rag_embeddings, _rag_model

    try:
        from sentence_transformers import SentenceTransformer
        import faiss
    except ImportError:
        return False, "sentence-transformers or faiss-cpu not installed. Run: pip install sentence-transformers faiss-cpu"

    # An empty corpus would fail below in the norm/shape computation; report it
    # through the same (ok, message) channel instead.
    if not PAPER_CORPUS:
        return False, "Paper corpus is empty; nothing to index"

    model = SentenceTransformer(EMBED_MODEL)

    # One chunk per paper: title + abstract + journal/year.
    texts = [
        f"Title: {paper['title']}\nAbstract: {paper['abstract']}\nJournal: {paper['journal']} ({paper['year']})"
        for paper in PAPER_CORPUS
    ]

    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # normalize
    embeddings = embeddings.astype(np.float32)  # FAISS works on float32 vectors

    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)  # Inner product = cosine similarity on normalized vectors
    index.add(embeddings)

    # Publish last, index last of all: it is the readiness flag for rag_query.
    _rag_model = model
    _rag_embeddings = embeddings
    _rag_index = index

    return True, f"Index built: {len(PAPER_CORPUS)} papers, {dim}-dim embeddings"


def _confidence_flag(score: float, n_results: int) -> str:
    """Map a retrieval score and hit count to a confidence label.

    HIGH needs both a score of at least 0.55 and at least two results;
    anything else scoring 0.35 or more is MEDIUM; the rest is SPECULATIVE.
    """
    has_strong_support = score >= 0.55 and n_results >= 2
    if has_strong_support:
        return "🟢 HIGH"
    if score >= 0.35:
        return "🟡 MEDIUM"
    return "🔴 SPECULATIVE"


def rag_query(question: str, top_k: int = 3) -> str:
    """Answer *question* with the best-matching abstracts from the paper index.

    The index is built lazily on the first call. The question is embedded with
    the same model as the corpus, L2-normalised, and searched against the
    inner-product index; hits scoring below 0.20 are discarded.

    Args:
        question: Free-text query to embed and search for.
        top_k: Maximum number of papers to retrieve; must be at least 1.

    Returns:
        Markdown text. On success: a confidence header, one section per
        retrieved paper (title, journal, year, PMID, abstract, relevance score)
        and a closing disclaimer. If the index cannot be built, a single
        "RAG system unavailable" warning line. If no hit passes the score
        threshold, a "no relevant information" notice.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    import logging  # local import: only used to record an index-build failure

    # Validate before any heavy work; a non-positive k is a caller error.
    if top_k < 1:
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    if _rag_index is None:
        try:
            ok, msg = _build_index()
        except Exception as exc:
            # UI boundary: a failed model load or encode should surface as the
            # same "unavailable" message as a missing dependency, not as an
            # unhandled error. The traceback is kept in the log.
            logging.getLogger(__name__).exception("RAG index build failed")
            ok, msg = False, f"{type(exc).__name__}: {exc}"
        if not ok:
            return f"⚠️ RAG system unavailable: {msg}"

    # No second import check is needed here: a non-None index can only exist
    # after _build_index() imported sentence-transformers and faiss successfully.

    # Encode query
    q_emb = _rag_model.encode([question], convert_to_numpy=True, show_progress_bar=False)
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)

    # Search (single query, so take row 0 of each result array)
    scores, indices = _rag_index.search(q_emb.astype(np.float32), top_k)
    scores = scores[0]
    indices = indices[0]

    # Filter: only use results above minimum threshold. Negative indices are
    # skipped as well, since they cannot refer to a paper.
    MIN_SCORE = 0.20
    valid = [(s, i) for s, i in zip(scores, indices) if s >= MIN_SCORE and i >= 0]

    # Derived from the corpus so the wording cannot drift from its real size.
    n_papers = len(PAPER_CORPUS)

    if not valid:
        return (
            "❌ **No relevant information found in the indexed papers.**\n\n"
            f"This assistant only answers questions based on {n_papers} indexed papers on:\n"
            "- LNP drug delivery (brain/GBM focus)\n"
            "- Protein corona biology\n"
            "- Cancer variants and precision oncology\n"
            "- Liquid biopsy biomarkers\n\n"
            "Please rephrase your question or ask about these topics."
        )

    # The first surviving hit is used as the headline score.
    top_score = valid[0][0]
    confidence = _confidence_flag(top_score, len(valid))

    # Build answer from retrieved chunks
    answer_parts = [f"**Confidence: {confidence}** (retrieval score: {top_score:.3f})\n"]

    for rank, (score, idx) in enumerate(valid, 1):
        paper = PAPER_CORPUS[idx]
        answer_parts.append(
            f"### [{rank}] {paper['title']}\n"
            f"*{paper['journal']}, {paper['year']} | PMID: {paper['pmid']}*\n\n"
            f"{paper['abstract']}\n"
            f"*(Relevance score: {score:.3f})*"
        )

    answer_parts.append(
        "\n---\n"
        f"⚠️ *This answer is grounded exclusively in the {n_papers} indexed papers. "
        "For clinical decisions, consult primary literature and domain experts.*"
    )

    return "\n\n".join(answer_parts)


# ─────────────────────────────────────────────
# GRADIO TAB BUILDER
# ─────────────────────────────────────────────

def build_chatbot_tab():
    """Create the research-assistant chat UI in the enclosing Gradio layout.

    Called from app.py to inject the chatbot into Tab A6; standalone mode calls
    it inside its own ``gr.Blocks`` context. Builds an intro note, a chat
    column (history, question box, Send and Clear buttons) and a sidebar with
    the indexed topics and the confidence legend, then wires the events to
    ``rag_query``.
    """
    # Static Markdown texts, kept apart from the layout code below.
    intro_md = (
        "**Status:** Model loads on first query (~30s)...\n\n"
        "Ask questions about LNP delivery, protein corona, cancer variants, or liquid biopsy. "
        "Answers are grounded in 20 indexed papers — never fabricated."
    )
    topics_md = (
        "**LNP Delivery**\n"
        "- mRNA-LNP formulation\n"
        "- Ionizable lipids & pKa\n"
        "- Brain/GBM delivery\n"
        "- Organ selectivity (SORT)\n"
        "- PEG & anti-PEG immunity\n\n"
        "**Protein Corona**\n"
        "- Hard vs soft corona\n"
        "- Vroman effect kinetics\n"
        "- ApoE/LDLR targeting\n\n"
        "**Cancer Variants**\n"
        "- TP53 mutation spectrum\n"
        "- KRAS G12C resistance\n"
        "- ClinVar classification\n\n"
        "**Liquid Biopsy**\n"
        "- ctDNA methylation\n"
        "- cfRNA biomarkers"
    )
    legend_md = (
        "### 🔑 Confidence Flags\n"
        "🟢 **HIGH** — strong match (≥0.55)\n"
        "🟡 **MEDIUM** — moderate match (0.35–0.55)\n"
        "🔴 **SPECULATIVE** — weak match (<0.35)\n\n"
        "*Only answers from indexed papers are shown.*"
    )

    gr.Markdown(intro_md)

    with gr.Row():
        # Left: the conversation itself.
        with gr.Column(scale=3):
            chatbox = gr.Chatbot(label="Research Assistant", height=420, bubble_full_width=False)
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Ask about LNP delivery, protein corona, cancer variants...",
                    label="Your question",
                    lines=2,
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear conversation", size="sm")

        # Right: reference sidebar.
        with gr.Column(scale=1):
            gr.Markdown("### 📚 Indexed Topics")
            gr.Markdown(topics_md)
            gr.Markdown(legend_md)

    def respond(message, history):
        # Blank input: leave the history untouched and just clear the box.
        query = message.strip()
        if query:
            answer = rag_query(query)
            history = history or []
            # The history keeps the message exactly as the user typed it.
            history.append((message, answer))
        return history, ""

    # The Send button and pressing Enter in the textbox share one handler.
    for register in (send_btn.click, user_input.submit):
        register(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input])
    clear_btn.click(lambda: ([], ""), outputs=[chatbox, user_input])

# ─────────────────────────────────────────────
# STANDALONE MODE
# ─────────────────────────────────────────────

if __name__ == "__main__":
    # Standalone launch: build the index eagerly, print its status line, then
    # serve the chatbot tab as the only content of a Gradio app.
    print("Building RAG index...")
    _index_ok, index_status = _build_index()
    print(index_status)

    demo = gr.Blocks(title="K R&D Lab — Research Assistant")
    with demo:
        gr.Markdown("# 🤖 K R&D Lab Research Assistant\n*Standalone mode*")
        build_chatbot_tab()

    demo.launch(share=False)