File size: 8,302 Bytes
3827871
 
 
 
 
ea4caf0
 
 
 
 
 
 
59992e2
 
 
 
 
ea4caf0
 
 
 
 
59992e2
 
 
 
 
ea4caf0
 
 
 
 
 
 
 
 
 
 
 
59992e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3827871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea4caf0
 
 
 
3827871
ea4caf0
 
59992e2
 
 
ea4caf0
 
3827871
ea4caf0
 
 
 
 
 
 
 
 
 
 
 
3827871
 
 
 
 
 
ea4caf0
3827871
 
 
ea4caf0
3827871
 
ea4caf0
3827871
 
 
 
ea4caf0
3827871
 
ea4caf0
3827871
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# =================================================================================
# app.py: Main application file for the Streamlit web interface
# =================================================================================
import streamlit as st
from dotenv import load_dotenv
from huggingface_hub import snapshot_download
import os

# === Helpers for checking / downloading the persisted LlamaIndex store from a HF dataset ===
# Dataset repo that holds the persisted index; overridable via environment.
LLAMA_INDEX_DATASET_ID = os.getenv("HF_INDEX_DATASET_ID", "alperensn/llamaIndexVectorBase_fda")
LLAMA_INDEX_SUBDIR = os.getenv("HF_INDEX_SUBDIR", "").strip()  # set this if the dataset keeps the index in a subfolder

# Check both the classic and the newer ("default__"-prefixed) persist-file namings.
MARKERS_CLASSIC = {"index_store.json", "docstore.json", "graph_store.json", "default__vector_store.json", "image__vector_store.json" }
# NOTE(review): "default_image__vector_store.json" uses a single underscore after
# "default" while every other entry uses a double underscore — verify this matches
# the filenames actually persisted in the dataset.
MARKERS_DEFAULT = {"default__index_store.json", "default__docstore.json", "default__vector_store.json", "default_image__vector_store.json","default__graph_store.json"}


def _persist_path(base_dir: str) -> str:
    """Resolve the on-disk persist directory, appending the optional dataset subfolder."""
    if LLAMA_INDEX_SUBDIR:
        return os.path.join(base_dir, LLAMA_INDEX_SUBDIR)
    return base_dir

def llama_index_exists(base_dir: str) -> bool:
    """Return True when a complete persisted LlamaIndex store is present under *base_dir*.

    "Complete" means every marker file of at least one naming scheme
    (classic or "default__"-prefixed) exists in the persist directory.
    """
    target = _persist_path(base_dir)
    if not os.path.isdir(target):
        return False
    present = set(os.listdir(target))
    return any(markers <= present for markers in (MARKERS_CLASSIC, MARKERS_DEFAULT))


def download_llama_index_if_needed(base_dir: str):
    """Fetch the persisted index from the HF dataset unless it already exists locally.

    Ensures the persist directory exists, then downloads the dataset snapshot
    into it only when no complete store is found (no-op otherwise).
    """
    target = _persist_path(base_dir)
    os.makedirs(target, exist_ok=True)
    if not llama_index_exists(base_dir):
        snapshot_download(
            repo_id=LLAMA_INDEX_DATASET_ID,
            repo_type="dataset",
            local_dir=target,
            local_dir_use_symlinks=False,
        )

# If the downloaded persist directory ended up nested below base_dir, locate it:
def find_llama_index_dir(base_dir: str) -> str:
    """Locate the directory under *base_dir* holding the persisted LlamaIndex files.

    Walks the tree (``os.walk`` yields *base_dir* itself first, so no separate
    top-level check is needed) and returns the first directory whose regular
    files cover a complete marker set. Falls back to *base_dir* when nothing
    matches or *base_dir* does not exist.
    """
    wanted_sets = (MARKERS_CLASSIC, MARKERS_DEFAULT)
    for root, _, filenames in os.walk(base_dir):
        # Only regular filenames are considered, so a *directory* that happens
        # to be named like a marker file can no longer produce a false match
        # (the old os.listdir pre-check did not make that distinction).
        names = set(filenames)
        if any(ws.issubset(names) for ws in wanted_sets):
            return root
    return base_dir


# Load environment variables from .env file before the project modules are
# imported (presumably config/rag_pipeline read env vars at import time —
# verify their import-time behavior).
load_dotenv()

# Import the modules we've created
import config
import rag_pipeline  # Now using the LlamaIndex pipeline

# --- Page Configuration ---
# Global page setup. Streamlit allows set_page_config() only once per page,
# and it must be the first Streamlit command executed in the script run.
st.set_page_config(
    page_title="PharmaBot",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
)

# --- State Management ---
def initialize_state():
    """Seed st.session_state with the keys the app relies on (idempotent).

    Existing values are never overwritten, so calling this on every rerun
    is safe.
    """
    defaults = {
        "messages": [{"role": "assistant", "content": "Welcome to PharmaBot! How can I help you today?"}],
        "query_engine": None,
        "initialized": False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

# --- UI Components ---
def setup_sidebar():
    """Render the sidebar: app description, medical disclaimer, technical details.

    Reads the model/store identifiers to display from the imported ``config``
    module.
    """
    with st.sidebar:
        st.header("About PharmaBot")
        st.info(
            "PharmaBot is an AI assistant designed to answer questions about "
            "pharmaceuticals based on a knowledge base of RAG documents. "
            "It uses a Retrieval-Augmented Generation (RAG) pipeline to provide accurate, "
            "context-aware answers."
        )
        st.warning("**Disclaimer: I am an AI assistant, not a medical professional. This information is for educational purposes only. Please consult with a qualified healthcare provider for any health concerns or before making any medical decisions.**"
        )
        st.markdown("---")
        st.header("Technical Details")
        # NOTE(review): this displays config.VECTOR_STORE_PATH while the app
        # loads the index from config.LLAMA_INDEX_STORE_PATH — confirm which
        # path is the intended display value.
        st.markdown(
            f"""
            - **LLM Model:** `{config.LLM_MODEL_ID}`
            - **Embedding Model:** `{config.EMBEDDING_MODEL_NAME}`
            - **Vector Type:** `LLama Index Vector Store`
            - **Vector Store:** `{config.VECTOR_STORE_PATH}`
            """
        )

def display_chat_history():
    """Replay every stored session message into the chat area, in order."""
    for entry in st.session_state.messages:
        role, content = entry["role"], entry["content"]
        with st.chat_message(role):
            st.write(content)

def handle_user_input(chat_engine):
    """Read one chat prompt, run it through *chat_engine*, and render/record the exchange.

    Does nothing when no prompt was submitted this rerun.
    """
    prompt = st.chat_input("Ask me anything about pharmaceuticals...")
    if not prompt:
        return

    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.write(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            reply = str(chat_engine.chat(prompt))
            st.write(reply)

    st.session_state.messages.append({"role": "assistant", "content": reply})

import time
from build_knowledge_base import build_vector_store
import os

# --- Main Application Logic ---
def main():
    """Run the Streamlit app.

    On first run, ensures a persisted LlamaIndex store exists locally
    (downloading it from the HF dataset, falling back to a local build),
    initializes the RAG pipeline once, then serves the chat UI.
    """
    # Fix: st.set_page_config() is deliberately NOT called here. It already
    # runs once at module import time, and Streamlit raises an exception when
    # it is called more than once per app page (or after other st.* commands).
    initialize_state()
    st.title("💊 PharmaBot: Your AI Pharmaceutical Assistant")
    setup_sidebar()

    # Initialize the RAG pipeline if it hasn't been already
    if not st.session_state.initialized:

        # 1) First try to download the persisted index from the dataset into
        #    the local store folder.
        if not llama_index_exists(config.LLAMA_INDEX_STORE_PATH):
            with st.status("Knowledge base not found locally. Downloading from dataset...", expanded=True) as status:
                try:
                    status.write(f"Downloading persisted index from: {LLAMA_INDEX_DATASET_ID}")
                    download_llama_index_if_needed(config.LLAMA_INDEX_STORE_PATH)
                    # The dataset may nest the persist dir; repoint config at
                    # the directory that actually holds the marker files.
                    detected_dir = find_llama_index_dir(config.LLAMA_INDEX_STORE_PATH)
                    if detected_dir != config.LLAMA_INDEX_STORE_PATH:
                        config.LLAMA_INDEX_STORE_PATH = detected_dir
                    status.update(label="Index downloaded from dataset.", state="complete", expanded=False)
                    time.sleep(1)
                except Exception as e:
                    # Download failed — build the vector store from scratch.
                    status.update(label="Dataset download failed. Falling back to local build...", state="running", expanded=True)
                    try:
                        status.write("This is a one-time setup and may take a few minutes...")
                        build_vector_store()
                        status.update(label="Knowledge base built successfully!", state="complete", expanded=False)
                        time.sleep(1)
                    except Exception as be:
                        # Both paths failed: surface both errors and halt the run.
                        status.update(label="Build Failed", state="error", expanded=True)
                        st.error(f"An error occurred while preparing the knowledge base:\n- dataset error: {e}\n- build error: {be}")
                        st.stop()

        # 2) Initialize the RAG pipeline
        with st.status("Initializing the RAG pipeline...", expanded=True) as status:
            try:
                status.write("Step 1/3: Initializing LLM and embedding models...")
                rag_pipeline.initialize_llm_and_embed_model()

                status.write("Step 2/3: Loading vector index from storage...")
                # Assumed to read from config.LLAMA_INDEX_STORE_PATH (possibly
                # redirected above) — verify in rag_pipeline.
                index = rag_pipeline.load_vector_index()

                status.write("Step 3/3: Building the conversational chat engine...")
                st.session_state.query_engine = rag_pipeline.build_query_engine(index)

                st.session_state.initialized = True
                status.update(label="Initialization Complete!", state="complete", expanded=False)
                time.sleep(1)
            except Exception as e:
                status.update(label="Initialization Failed", state="error")
                st.error(f"An unexpected error occurred during initialization: {e}")
                return

        # Rerun so the chat UI renders without the setup status widgets.
        st.rerun()


    # Display chat and handle input if initialized
    if st.session_state.initialized:
        display_chat_history()
        handle_user_input(st.session_state.query_engine)

# Standard script entry point (also what `streamlit run app.py` executes).
if __name__ == "__main__":
    main()