Spaces:

kramachan
/

Medical-Labelling

Sleeping

App Files Files Community

kramachan committed on Feb 25

Commit

0fc1003

verified ·

1 Parent(s): 39626d1

Upload 2 files

Browse files

Files changed (2) hide show

src/ETL_VectorDB.py +102 -0
src/app.py +209 -0

src/ETL_VectorDB.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import os
+import logging
+from langchain_core.documents import Document
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+from langchain_openai import ChatOpenAI
+import json
+# Set up logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+# Get a logger for this module
+logger = logging.getLogger(__name__)
+working_dir = os.path.dirname(os.path.abspath(__file__))
+parent_dir = os.path.dirname(working_dir)
+data_dir = f"{parent_dir}/"
+vector_db_dir = f"{parent_dir}/vector_db"
+logger.info("Reading Files Process Started...")
+all_records = []
+# loop through all files
+for file_name in os.listdir(data_dir):
+    if file_name.endswith(".json"):
+        file_path = os.path.join(data_dir, file_name)
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            # if JSON contains list of records
+            if isinstance(data, list):
+                all_records.extend(data)
+            else:
+                all_records.append(data)
+print("Total drug records:", len(all_records))
+documents = []
+for record in data:
+    drug = record.get("generic_name", ["UNKNOWN"])[0].upper()
+    # choose sections you want in RAG
+    sections = [
+        "indications_and_usage",
+        "warnings_and_cautions",
+        "adverse_reactions",
+        "drug_interactions"
+    ]
+    for section in sections:
+        if section in record:
+            for text in record[section]:
+                documents.append(
+                    Document(
+                        page_content=text,
+                        metadata={
+                            "generic_name": drug,
+                            "section": section
+                        }
+                    )
+                )
+print("Documents created:", len(documents))
+logger.info("Split chunk Files Process Started...")
+splitter = RecursiveCharacterTextSplitter(
+    chunk_size=800,
+    chunk_overlap=150
+)
+chunked_docs = splitter.split_documents(documents)
+print("Chunks created:", len(chunked_docs))
+logger.info("Embeddings Files Process Started...")
+embeddings = HuggingFaceEmbeddings(
+    model_name="sentence-transformers/all-MiniLM-L6-v2"
+)
+#%%
+print("Chroma ready ✅")
+logger.info(" VectorDB Process Started...")
+vectordb = Chroma.from_documents(
+    documents=chunked_docs,
+    embedding=embeddings,
+    persist_directory="./chroma_db"
+)
+print("Vector DB created successfully ✅")
+logger.info("VectorDB Process Completed...")

src/app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import os
+import logging
+from dotenv import load_dotenv
+import streamlit as st
+from langchain_chroma import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import ChatOpenAI
+# Get a logger for this module
+logger = logging.getLogger(__name__)
+logger.info("Design Page...")
+# -------------------------------
+# PAGE CONFIG (MUST BE FIRST)
+# -------------------------------
+PORT = int(os.environ.get("PORT", 8501))
+st.markdown("""
+<style>
+.main-title {
+    font-size: 52px;
+    font-weight: 800;
+    text-align: center;
+    color: #0B5ED7;
+    margin-bottom: 5px;
+}
+.sub-title {
+    font-size: 20px;
+    text-align: center;
+    color: #555555;
+    margin-bottom: 30px;
+}
+</style>
+""", unsafe_allow_html=True)
+st.markdown(
+    '<div class="main-title">💊 AI Medical Labelling System</div>',
+    unsafe_allow_html=True
+)
+st.markdown(
+    '<div class="sub-title">Simplifying FDA Drug Safety Information using Generative AI & RAG</div>',
+    unsafe_allow_html=True
+)
+# -------------------------------
+# CUSTOM CSS (FANCY DESIGN)
+# -------------------------------
+st.markdown("""
+<style>
+.main {
+    background-color: #f7f9fc;
+}
+.big-title {
+    font-size:40px;
+    font-weight:700;
+    color:#1f4e79;
+}
+.subtitle {
+    font-size:18px;
+    color:#555;
+}
+.result-card {
+    background-color:white;
+    padding:20px;
+    border-radius:12px;
+    box-shadow:0px 2px 10px rgba(0,0,0,0.08);
+    margin-top:15px;
+}
+</style>
+""", unsafe_allow_html=True)
+# -------------------------------
+# HEADER
+# -------------------------------
+st.divider()
+# -------------------------------
+# SIDEBAR CONTROLS
+# -------------------------------
+with st.sidebar:
+    st.header("⚙️ Search Options")
+    drug_name = st.text_input(
+        "Drug Name",
+        placeholder="PHENYTOIN SODIUM"
+    )
+    selected_results = st.radio(
+        "Information Type",
+        ["Side Effects", "Warnings", "Both"]
+    )
+    run_button = st.button("🔍 Generate Explanation")
+# -------------------------------
+# LOAD ENV + MODELS
+# -------------------------------
+logger.info("Loading HuggingFace embedding model...")
+load_dotenv()
+working_dir = os.path.dirname(os.path.abspath(__file__))
+embeddings = HuggingFaceEmbeddings(
+    model_name="sentence-transformers/all-MiniLM-L6-v2"
+)
+vectordb = Chroma(
+    persist_directory=os.path.join(working_dir, "Chroma_db"),
+    embedding_function=embeddings
+)
+logger.info("Calling OpenAI model gpt-4o-mini...")
+llm = ChatOpenAI(
+    model="gpt-4o-mini",
+    temperature=0
+)
+# -------------------------------
+# RAG FUNCTION
+# -------------------------------
+def generate_section(drug_name, section, rules):
+    results = vectordb.get(
+        where={
+            "$and": [
+                {"generic_name": drug_name},
+                {"section": section}
+            ]
+        }
+    )
+    documents = results.get("documents", [])
+    if not documents:
+        st.warning(f"No data found for {section}")
+        return
+    context = "\n".join(set(documents))
+    prompt = f"""
+You are a medical assistant.
+Rewrite the FDA drug information into simplified,
+easy-to-understand language.
+Rules:
+{rules}
+Drug: {drug_name}
+FDA TEXT:
+{context}
+"""
+    with st.spinner("🧠 AI is analysing FDA data..."):
+        response = llm.invoke(prompt)
+    st.markdown(
+        f'<div class="result-card">{response.content}</div>',
+        unsafe_allow_html=True
+    )
+logger.info("Configuring prompt..")
+# -------------------------------
+# RULES
+# -------------------------------
+SIDE_EFFECT_RULES = """
+- Use simple English
+- Bullet points (max 7)
+- Group similar side effects
+- Separate common vs serious
+"""
+WARNING_RULES = """
+- Use simple English
+- Bullet points (max 7)
+- Group warnings clearly
+"""
+SECTION_MAP = {
+    "Side Effects": [("adverse_reactions", SIDE_EFFECT_RULES)],
+    "Warnings": [("warnings_and_cautions", WARNING_RULES)],
+    "Both": [
+        ("adverse_reactions", SIDE_EFFECT_RULES),
+        ("warnings_and_cautions", WARNING_RULES),
+    ],
+}
+# -------------------------------
+# MAIN ACTION
+# -------------------------------
+if run_button and drug_name:
+    st.subheader(f"Results for: {drug_name.upper()}")
+    for section, rules in SECTION_MAP[selected_results]:
+        generate_section(drug_name, section, rules)
+elif run_button:
+    st.warning("Please enter a drug name.")