File size: 10,514 Bytes
8bc5b01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# chatbot.py
import os
import numpy as np
import gradio as gr
from journal import journal_log

# ─────────────────────────────────────────────
# PAPER CORPUS — 20 curated PMIDs
# Topics: LNP/brain delivery, protein corona, cancer variants
# ─────────────────────────────────────────────

# PubMed IDs of the curated papers, as strings, one per line for easy diffing.
PAPER_PMIDS = [
    "34394960",
    "32251383",
    "29653760",
    "22782619",
    "33208369",
    "18809927",
    "22086677",
    "31565943",
    "33754708",
    "20461061",
    "30096302",
    "30311387",
    "32461654",
    "27328919",
    "31820981",
    "28678784",
    "31348638",
    "33016924",
    "31142840",
    "33883548",
]

# Records that get embedded and retrieved. Each entry is a dict with the keys
# "pmid", "title", "abstract", "journal", "year" and "topic".
# rag_query() looks papers up by their position in this list, so the order
# here is the order of the rows in the search index.
# NOTE(review): only two entries are present below, while PAPER_PMIDS lists 20
# and the UI text says "20 indexed papers" — the remaining records still need
# to be filled in.
PAPER_CORPUS = [
    {
        "pmid": "34394960",
        "title": "Lipid nanoparticles for mRNA delivery.",
        "abstract": "Messenger RNA (mRNA) has emerged as a new category of therapeutic agent to prevent and treat various diseases. To function in vivo, mRNA requires safe, effective and stable delivery systems that protect the nucleic acid from degradation and that allow cellular uptake and mRNA release. Lipid nanoparticles have successfully entered the clinic for the delivery of mRNA; in particular, lipid nanoparticle-mRNA vaccines are now in clinical use against coronavirus disease 2019 (COVID-19), which marks a milestone for mRNA therapeutics. In this Review, we discuss the design of lipid nanoparticles for mRNA delivery and examine physiological barriers and possible administration routes for lipid nanoparticle-mRNA systems. We then consider key points for the clinical translation of lipid nanoparticle-mRNA formulations, including good manufacturing practice, stability, storage and safety, and highlight preclinical and clinical studies of lipid nanoparticle-mRNA therapeutics for infectious diseases, cancer and genetic disorders. Finally, we give an outlook to future possibilities and remaining challenges for this promising technology.",
        "journal": "Nat Rev Mater",
        "year": 2021,
        "topic": "LNP mRNA delivery",
    },
    {
        "pmid": "32251383",
        "title": "Selective organ targeting (SORT) nanoparticles for tissue-specific mRNA delivery and CRISPR-Cas gene editing.",
        "abstract": "CRISPR-Cas gene editing and messenger RNA-based protein replacement therapy hold tremendous potential to effectively treat disease-causing mutations with diverse cellular origin. However, it is currently impossible to rationally design nanoparticles that selectively target specific tissues. Here, we report a strategy termed selective organ targeting (SORT) wherein multiple classes of lipid nanoparticles are systematically engineered to exclusively edit extrahepatic tissues via addition of a supplemental SORT molecule. Lung-, spleen- and liver-targeted SORT lipid nanoparticles were designed to selectively edit therapeutically relevant cell types including epithelial cells, endothelial cells, B cells, T cells and hepatocytes. SORT is compatible with multiple gene editing techniques, including mRNA, Cas9 mRNA/single guide RNA and Cas9 ribonucleoprotein complexes, and is envisioned to aid the development of protein replacement and gene correction therapeutics in targeted tissues.",
        "journal": "Nat Nanotechnol",
        "year": 2020,
        "topic": "LNP organ selectivity",
    },
    # ... (add all 20 entries from your previous code)
    # For brevity only the first two are shown here; you must copy in the full list.
]

# ─────────────────────────────────────────────
# RAG ENGINE
# ─────────────────────────────────────────────

# Module-level retrieval state. The three `_rag_*` names start as None and are
# populated by _build_index(); rag_query() triggers the build on first use.
_rag_index = None  # FAISS inner-product index over the corpus embeddings
_rag_embeddings = None  # L2-normalised embedding matrix, one row per PAPER_CORPUS entry
_rag_model = None  # SentenceTransformer used to embed both corpus and queries
EMBED_MODEL = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer

def _build_index():
    """Embed PAPER_CORPUS and build the FAISS index used for retrieval.

    The heavy dependencies are imported here rather than at module level so
    the module can be imported (and report a readable message) when they are
    not installed.

    On success the module globals ``_rag_model``, ``_rag_embeddings`` and
    ``_rag_index`` are replaced together; on any failure they are left
    untouched, so a half-built state is never published.

    Returns:
        tuple[bool, str]: ``(True, summary)`` when the index was built, or
        ``(False, reason)`` when the dependencies are missing or the corpus
        is empty.
    """
    global _rag_index, _rag_embeddings, _rag_model
    try:
        from sentence_transformers import SentenceTransformer
        import faiss
    except ImportError:
        return False, "sentence-transformers or faiss-cpu not installed. Run: pip install sentence-transformers faiss-cpu"

    # An empty corpus would encode to a 1-D empty array and crash on the
    # axis=1 norm below; report it through the normal (ok, msg) channel.
    if not PAPER_CORPUS:
        return False, "PAPER_CORPUS is empty: nothing to index"

    texts = [
        f"Title: {p['title']}\nAbstract: {p['abstract']}\nJournal: {p['journal']} ({p['year']})"
        for p in PAPER_CORPUS
    ]
    model = SentenceTransformer(EMBED_MODEL)
    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    # FAISS expects C-contiguous float32 rows.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # L2-normalise so inner product equals cosine similarity. The floor keeps
    # an all-zero row at zero instead of turning it into NaN.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    embeddings = embeddings / np.maximum(norms, np.finfo(np.float32).tiny)

    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    # Publish everything at once, only after every step has succeeded.
    _rag_model, _rag_embeddings, _rag_index = model, embeddings, index
    return True, f"Index built: {len(PAPER_CORPUS)} papers, {dim}-dim embeddings"

def _confidence_flag(score: float, n_results: int) -> str:
    """Translate a retrieval score into the traffic-light label shown to users.

    Args:
        score: Similarity score of the best hit.
        n_results: Number of hits that passed the relevance cut-off.

    Returns:
        "🟢 HIGH" when the score is at least 0.55 and at least two hits
        support it, "🟡 MEDIUM" when the score is at least 0.35, and
        "🔴 SPECULATIVE" otherwise.
    """
    well_supported = score >= 0.55 and n_results >= 2
    if well_supported:
        return "🟢 HIGH"
    # A strong score with a single supporting hit lands here as MEDIUM.
    return "🟡 MEDIUM" if score >= 0.35 else "🔴 SPECULATIVE"

def rag_query(question: str, top_k: int = 3) -> str:
    """Answer *question* with the best-matching indexed papers, as Markdown.

    The index is built on first use via ``_build_index()``. The question is
    embedded with the same model as the corpus, L2-normalised, and searched
    against the inner-product index; hits scoring below 0.20 are dropped.

    Args:
        question: Free-text user question. Blank or ``None`` input is
            answered with the "no relevant information" message without
            touching the model.
        top_k: Maximum number of papers to return. Values below 1 are
            treated as 1 because the FAISS search rejects ``k < 1``.

    Returns:
        A Markdown string: either a warning that the RAG system is
        unavailable, the "no relevant information" message, or a confidence
        header followed by one section per retrieved paper and a disclaimer.

    Side effects:
        May build the index (loading the embedding model), and calls
        ``journal_log`` once for every query that returns at least one paper.
    """
    no_match = (
        "❌ **No relevant information found in the indexed papers.**\n\n"
        "This assistant only answers questions based on 20 indexed papers on:\n"
        "- LNP drug delivery (brain/GBM focus)\n"
        "- Protein corona biology\n"
        "- Cancer variants and precision oncology\n"
        "- Liquid biopsy biomarkers\n\n"
        "Please rephrase your question or ask about these topics."
    )

    # A blank question has no meaningful embedding to match against.
    if not question or not question.strip():
        return no_match

    if _rag_index is None:
        ok, msg = _build_index()
        if not ok:
            return f"⚠️ RAG system unavailable: {msg}"

    k = max(1, int(top_k))

    q_emb = _rag_model.encode([question], convert_to_numpy=True, show_progress_bar=False)
    q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)
    # Normalise so the inner product is a cosine similarity; the floor avoids
    # NaN if the model ever returns an all-zero vector.
    q_norm = np.linalg.norm(q_emb, axis=1, keepdims=True)
    q_emb = q_emb / np.maximum(q_norm, np.finfo(np.float32).tiny)

    scores, indices = _rag_index.search(q_emb, k)
    MIN_SCORE = 0.20
    # FAISS pads missing neighbours with index -1; drop those and weak hits.
    valid = [
        (float(s), int(i))
        for s, i in zip(scores[0], indices[0])
        if s >= MIN_SCORE and i >= 0
    ]
    if not valid:
        return no_match

    # Hits come back best-first, so the first one carries the top score.
    top_score = valid[0][0]
    confidence = _confidence_flag(top_score, len(valid))
    answer_parts = [f"**Confidence: {confidence}** (retrieval score: {top_score:.3f})\n"]
    for rank, (score, idx) in enumerate(valid, 1):
        paper = PAPER_CORPUS[idx]
        answer_parts.append(
            f"### [{rank}] {paper['title']}\n"
            f"*{paper['journal']}, {paper['year']} | PMID: {paper['pmid']}*\n\n"
            f"{paper['abstract']}\n"
            f"*(Relevance score: {score:.3f})*"
        )
    answer_parts.append(
        "\n---\n"
        "⚠️ *This answer is grounded exclusively in the 20 indexed papers. "
        "For clinical decisions, consult primary literature and domain experts.*"
    )
    journal_log("S1-A·R2e", question, f"retrieved {len(valid)} papers, top score {top_score:.3f}")
    return "\n\n".join(answer_parts)

def build_chatbot_tab():
    """Lay out the research-assistant chat UI and wire up its events.

    Creates Gradio components in place, so it is meant to be called while a
    Gradio layout context (for example ``with gr.Blocks():``) is open.
    The left column holds the chat window, the question box and the send /
    clear buttons; the right column shows the indexed topics and the
    confidence-flag legend. Returns nothing.
    """
    intro_md = (
        "**Status:** Model loads on first query (~30s)...\n\n"
        "Ask questions about LNP delivery, protein corona, cancer variants, or liquid biopsy. "
        "Answers are grounded in 20 indexed papers — never fabricated."
    )
    topics_md = (
        "**LNP Delivery**\n"
        "- mRNA-LNP formulation\n"
        "- Ionizable lipids & pKa\n"
        "- Brain/GBM delivery\n"
        "- Organ selectivity (SORT)\n"
        "- PEG & anti-PEG immunity\n\n"
        "**Protein Corona**\n"
        "- Hard vs soft corona\n"
        "- Vroman effect kinetics\n"
        "- ApoE/LDLR targeting\n\n"
        "**Cancer Variants**\n"
        "- TP53 mutation spectrum\n"
        "- KRAS G12C resistance\n"
        "- ClinVar classification\n\n"
        "**Liquid Biopsy**\n"
        "- ctDNA methylation\n"
        "- cfRNA biomarkers"
    )
    legend_md = (
        "### 🔑 Confidence Flags\n"
        "🟢 **HIGH** — strong match (≥0.55)\n"
        "🟡 **MEDIUM** — moderate match (0.35–0.55)\n"
        "🔴 **SPECULATIVE** — weak match (<0.35)\n\n"
        "*Only answers from indexed papers are shown.*"
    )

    gr.Markdown(intro_md)
    with gr.Row():
        # Left: conversation area.
        with gr.Column(scale=3):
            chatbox = gr.Chatbot(label="Research Assistant", height=420, bubble_full_width=False)
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Ask about LNP delivery, protein corona, cancer variants...",
                    label="Your question",
                    lines=2,
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear conversation", size="sm")
        # Right: static reference panels.
        with gr.Column(scale=1):
            gr.Markdown("### 📚 Indexed Topics")
            gr.Markdown(topics_md)
            gr.Markdown(legend_md)

    def respond(message, history):
        """Append one (question, answer) turn and clear the input box."""
        stripped = message.strip()
        if not stripped:
            # Nothing to ask: keep the conversation as is, just reset the box.
            return history, ""
        answer = rag_query(stripped)
        history = history or []
        history.append((message, answer))
        return history, ""

    # The Send button and pressing Enter in the textbox do the same thing.
    for listen in (send_btn.click, user_input.submit):
        listen(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input])
    clear_btn.click(lambda: ([], ""), outputs=[chatbox, user_input])

# ─────────────────────────────────────────────
# STANDALONE MODE
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Build the index up front so the outcome is printed before the UI starts.
    # The app is launched whether or not the build succeeded.
    print("Building RAG index...")
    ok, msg = _build_index()
    print(msg)

    # Host the chatbot tab in its own page when run as a script.
    demo = gr.Blocks(title="K R&D Lab — Research Assistant")
    with demo:
        gr.Markdown("# 🤖 K R&D Lab Research Assistant\n*Standalone mode*")
        build_chatbot_tab()

    demo.launch(share=False)