Create chatbot.py
Browse files- chatbot.py +190 -0
chatbot.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# chatbot.py
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import gradio as gr
|
| 5 |
+
from journal import journal_log
|
| 6 |
+
|
| 7 |
+
# ─────────────────────────────────────────────
|
| 8 |
+
# PAPER CORPUS — 20 curated PMIDs
|
| 9 |
+
# Topics: LNP/brain delivery, protein corona, cancer variants
|
| 10 |
+
# ─────────────────────────────────────────────
|
| 11 |
+
|
| 12 |
+
# PubMed IDs (as strings) of the curated papers.
# NOTE(review): 20 IDs are listed here, while PAPER_CORPUS currently holds
# fewer full records; nothing in the visible code reads this list, so confirm
# whether it is meant to drive corpus loading or is purely informational.
PAPER_PMIDS = [
    "34394960", "32251383", "29653760", "22782619", "33208369",
    "18809927", "22086677", "31565943", "33754708", "20461061",
    "30096302", "30311387", "32461654", "27328919", "31820981",
    "28678784", "31348638", "33016924", "31142840", "33883548",
]
|
| 18 |
+
|
| 19 |
+
# Full records for the indexed papers. Each entry is a dict with the keys
# "pmid", "title", "abstract", "journal", "year" and "topic". The index builder
# embeds title/abstract/journal/year; the answer formatter prints
# title/journal/year/pmid/abstract. "topic" is not read by the visible code.
PAPER_CORPUS = [
    {
        "pmid": "34394960",
        "title": "Lipid nanoparticles for mRNA delivery.",
        "abstract": "Messenger RNA (mRNA) has emerged as a new category of therapeutic agent to prevent and treat various diseases. To function in vivo, mRNA requires safe, effective and stable delivery systems that protect the nucleic acid from degradation and that allow cellular uptake and mRNA release. Lipid nanoparticles have successfully entered the clinic for the delivery of mRNA; in particular, lipid nanoparticle-mRNA vaccines are now in clinical use against coronavirus disease 2019 (COVID-19), which marks a milestone for mRNA therapeutics. In this Review, we discuss the design of lipid nanoparticles for mRNA delivery and examine physiological barriers and possible administration routes for lipid nanoparticle-mRNA systems. We then consider key points for the clinical translation of lipid nanoparticle-mRNA formulations, including good manufacturing practice, stability, storage and safety, and highlight preclinical and clinical studies of lipid nanoparticle-mRNA therapeutics for infectious diseases, cancer and genetic disorders. Finally, we give an outlook to future possibilities and remaining challenges for this promising technology.",
        "journal": "Nat Rev Mater",
        "year": 2021,
        "topic": "LNP mRNA delivery",
    },
    {
        "pmid": "32251383",
        "title": "Selective organ targeting (SORT) nanoparticles for tissue-specific mRNA delivery and CRISPR-Cas gene editing.",
        "abstract": "CRISPR-Cas gene editing and messenger RNA-based protein replacement therapy hold tremendous potential to effectively treat disease-causing mutations with diverse cellular origin. However, it is currently impossible to rationally design nanoparticles that selectively target specific tissues. Here, we report a strategy termed selective organ targeting (SORT) wherein multiple classes of lipid nanoparticles are systematically engineered to exclusively edit extrahepatic tissues via addition of a supplemental SORT molecule. Lung-, spleen- and liver-targeted SORT lipid nanoparticles were designed to selectively edit therapeutically relevant cell types including epithelial cells, endothelial cells, B cells, T cells and hepatocytes. SORT is compatible with multiple gene editing techniques, including mRNA, Cas9 mRNA/single guide RNA and Cas9 ribonucleoprotein complexes, and is envisioned to aid the development of protein replacement and gene correction therapeutics in targeted tissues.",
        "journal": "Nat Nanotechnol",
        "year": 2020,
        "topic": "LNP organ selectivity",
    },
    # ... (add all 20 entries from your previous code)
    # For brevity only the first two are shown here; you must copy in the full list.
]
|
| 39 |
+
|
| 40 |
+
# ─────────────────────────────────────────────
|
| 41 |
+
# RAG ENGINE
|
| 42 |
+
# ─────────────────────────────────────────────
|
| 43 |
+
|
| 44 |
+
# Lazily-initialized retrieval state: all three stay None until
# _build_index() has run successfully.
_rag_index = None  # FAISS inner-product index over the corpus embeddings
_rag_embeddings = None  # L2-normalized embedding matrix, one row per PAPER_CORPUS entry
_rag_model = None  # SentenceTransformer used to encode both corpus and queries
EMBED_MODEL = "all-MiniLM-L6-v2"  # model name handed to SentenceTransformer
|
| 48 |
+
|
| 49 |
+
def _build_index():
|
| 50 |
+
global _rag_index, _rag_embeddings, _rag_model
|
| 51 |
+
try:
|
| 52 |
+
from sentence_transformers import SentenceTransformer
|
| 53 |
+
import faiss
|
| 54 |
+
except ImportError:
|
| 55 |
+
return False, "sentence-transformers or faiss-cpu not installed. Run: pip install sentence-transformers faiss-cpu"
|
| 56 |
+
_rag_model = SentenceTransformer(EMBED_MODEL)
|
| 57 |
+
texts = [f"Title: {p['title']}\nAbstract: {p['abstract']}\nJournal: {p['journal']} ({p['year']})" for p in PAPER_CORPUS]
|
| 58 |
+
_rag_embeddings = _rag_model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
|
| 59 |
+
_rag_embeddings = _rag_embeddings / np.linalg.norm(_rag_embeddings, axis=1, keepdims=True)
|
| 60 |
+
dim = _rag_embeddings.shape[1]
|
| 61 |
+
_rag_index = faiss.IndexFlatIP(dim)
|
| 62 |
+
_rag_index.add(_rag_embeddings.astype(np.float32))
|
| 63 |
+
return True, f"Index built: {len(PAPER_CORPUS)} papers, {dim}-dim embeddings"
|
| 64 |
+
|
| 65 |
+
def _confidence_flag(score: float, n_results: int) -> str:
|
| 66 |
+
if score >= 0.55 and n_results >= 2:
|
| 67 |
+
return "🟢 HIGH"
|
| 68 |
+
elif score >= 0.35:
|
| 69 |
+
return "🟡 MEDIUM"
|
| 70 |
+
else:
|
| 71 |
+
return "🔴 SPECULATIVE"
|
| 72 |
+
|
| 73 |
+
def rag_query(question: str, top_k: int = 3) -> str:
    """Answer *question* with the most similar papers from the index.

    The index is built on first use. The question is embedded, normalized and
    searched against the FAISS index; hits scoring below 0.20 are dropped. The
    remaining hits are rendered as Markdown, prefixed with a confidence flag
    derived from the best score, and the query is recorded via
    ``journal_log``.

    Args:
        question: Free-text user question.
        top_k: Maximum number of papers to return. Values below 1 are treated
            as 1 and values above the index size are capped, because FAISS
            requires a positive ``k``.

    Returns:
        A Markdown string: the formatted hits, a "no relevant information"
        notice when nothing passes the cut-off, or a warning when the
        retrieval stack cannot be initialized.
    """
    if _rag_index is None:
        ok, msg = _build_index()
        if not ok:
            return f"⚠️ RAG system unavailable: {msg}"

    # Keep k within [1, number of indexed vectors].
    k = max(1, min(int(top_k), _rag_index.ntotal))

    q_emb = _rag_model.encode([question], convert_to_numpy=True, show_progress_bar=False)
    q_emb = np.asarray(q_emb, dtype=np.float32)
    # Guard the normalization so a zero vector does not become NaN.
    q_norm = np.linalg.norm(q_emb, axis=1, keepdims=True)
    q_norm[q_norm == 0] = 1.0
    q_emb = np.ascontiguousarray(q_emb / q_norm, dtype=np.float32)

    scores, indices = _rag_index.search(q_emb, k)
    scores = scores[0]
    indices = indices[0]

    MIN_SCORE = 0.20
    # FAISS pads missing results with index -1; drop those and weak matches.
    valid = [(s, i) for s, i in zip(scores, indices) if s >= MIN_SCORE and i >= 0]
    if not valid:
        return (
            "❌ **No relevant information found in the indexed papers.**\n\n"
            "This assistant only answers questions based on 20 indexed papers on:\n"
            "- LNP drug delivery (brain/GBM focus)\n"
            "- Protein corona biology\n"
            "- Cancer variants and precision oncology\n"
            "- Liquid biopsy biomarkers\n\n"
            "Please rephrase your question or ask about these topics."
        )

    # Search results come back best-first, so the first valid hit is the top score.
    top_score = valid[0][0]
    confidence = _confidence_flag(top_score, len(valid))
    answer_parts = [f"**Confidence: {confidence}** (retrieval score: {top_score:.3f})\n"]
    for rank, (score, idx) in enumerate(valid, 1):
        paper = PAPER_CORPUS[idx]
        answer_parts.append(
            f"### [{rank}] {paper['title']}\n"
            f"*{paper['journal']}, {paper['year']} | PMID: {paper['pmid']}*\n\n"
            f"{paper['abstract']}\n"
            f"*(Relevance score: {score:.3f})*"
        )
    answer_parts.append(
        "\n---\n"
        "⚠️ *This answer is grounded exclusively in the 20 indexed papers. "
        "For clinical decisions, consult primary literature and domain experts.*"
    )
    journal_log("S1-A·R2e", question, f"retrieved {len(valid)} papers, top score {top_score:.3f}")
    return "\n\n".join(answer_parts)
|
| 119 |
+
|
| 120 |
+
def build_chatbot_tab():
    """Lay out the research-assistant chat UI and wire up its events.

    Components are created in the currently open Gradio layout context (the
    standalone entry point calls this inside ``gr.Blocks``). The left column
    holds the chat window, the question box, and the send and clear buttons;
    the right column holds static reference text. Sending a question runs it
    through ``rag_query`` and appends the exchange to the chat history.
    """
    intro_md = (
        "**Status:** Model loads on first query (~30s)...\n\n"
        "Ask questions about LNP delivery, protein corona, cancer variants, or liquid biopsy. "
        "Answers are grounded in 20 indexed papers — never fabricated."
    )
    topics_md = (
        "**LNP Delivery**\n"
        "- mRNA-LNP formulation\n"
        "- Ionizable lipids & pKa\n"
        "- Brain/GBM delivery\n"
        "- Organ selectivity (SORT)\n"
        "- PEG & anti-PEG immunity\n\n"
        "**Protein Corona**\n"
        "- Hard vs soft corona\n"
        "- Vroman effect kinetics\n"
        "- ApoE/LDLR targeting\n\n"
        "**Cancer Variants**\n"
        "- TP53 mutation spectrum\n"
        "- KRAS G12C resistance\n"
        "- ClinVar classification\n\n"
        "**Liquid Biopsy**\n"
        "- ctDNA methylation\n"
        "- cfRNA biomarkers"
    )
    flags_md = (
        "### 🔑 Confidence Flags\n"
        "🟢 **HIGH** — strong match (≥0.55)\n"
        "🟡 **MEDIUM** — moderate match (0.35–0.55)\n"
        "🔴 **SPECULATIVE** — weak match (<0.35)\n\n"
        "*Only answers from indexed papers are shown.*"
    )

    gr.Markdown(intro_md)
    with gr.Row():
        # Left: conversation area.
        with gr.Column(scale=3):
            chatbox = gr.Chatbot(label="Research Assistant", height=420, bubble_full_width=False)
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Ask about LNP delivery, protein corona, cancer variants...",
                    label="Your question",
                    lines=2,
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear conversation", size="sm")
        # Right: static reference panel.
        with gr.Column(scale=1):
            gr.Markdown("### 📚 Indexed Topics")
            gr.Markdown(topics_md)
            gr.Markdown(flags_md)

    def respond(message, history):
        """Append the answer for *message* to *history* and clear the input box."""
        question = message.strip()
        if question:
            history = history or []
            # The chat shows the message as typed; only the query is stripped.
            history.append((message, rag_query(question)))
        return history, ""

    def _reset():
        """Return an empty chat history and an empty input box."""
        return [], ""

    # The button click and pressing Enter share the same handler and wiring.
    wiring = {"inputs": [user_input, chatbox], "outputs": [chatbox, user_input]}
    send_btn.click(respond, **wiring)
    user_input.submit(respond, **wiring)
    clear_btn.click(_reset, outputs=[chatbox, user_input])
|
| 177 |
+
|
| 178 |
+
# ─────────────────────────────────────────────
|
| 179 |
+
# STANDALONE MODE
|
| 180 |
+
# ─────────────────────────────────────────────
|
| 181 |
+
if __name__ == "__main__":
    # Build the index before the UI starts. A failure is only reported here:
    # the app still launches, and rag_query() retries the build on the first
    # question because the index is still unset.
    print("Building RAG index...")
    _, status = _build_index()
    print(status)

    demo = gr.Blocks(title="K R&D Lab — Research Assistant")
    with demo:
        gr.Markdown("# 🤖 K R&D Lab Research Assistant\n*Standalone mode*")
        build_chatbot_tab()

    # Serve locally only; no public share link.
    demo.launch(share=False)
|