import os

import streamlit as st
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub

# Placeholder for custom page styling (CSS can be injected here).
st.markdown("""
""", unsafe_allow_html=True)

PROMPT_TEMPLATE = """
You are an expert research assistant. Use the provided context to answer the query.
If unsure, state that you don't know. Be concise and factual (max 3 sentences).

Query: {user_query}
Context: {document_context}
Answer:
"""

PDF_STORAGE_PATH = 'document_store/pdfs/'

# Embedding model used to vectorize document chunks.
EMBEDDING_MODEL = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
DOCUMENT_VECTOR_DB = InMemoryVectorStore(EMBEDDING_MODEL)

HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Model to use
MODEL_REPO = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Set up the LLM using LangChain + the Hugging Face Inference API
LANGUAGE_MODEL = HuggingFaceHub(
    repo_id=MODEL_REPO,
    model_kwargs={"temperature": 0.7, "max_new_tokens": 2000},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)


def save_uploaded_file(uploaded_file):
    """Persist the uploaded PDF to local storage and return its path."""
    os.makedirs(PDF_STORAGE_PATH, exist_ok=True)  # ensure the storage directory exists
    file_path = os.path.join(PDF_STORAGE_PATH, uploaded_file.name)
    with open(file_path, "wb") as file:
        file.write(uploaded_file.getbuffer())
    return file_path


def load_pdf_documents(file_path):
    """Load the PDF into LangChain documents using pdfplumber."""
    document_loader = PDFPlumberLoader(file_path)
    return document_loader.load()


def chunk_documents(raw_documents):
    """Split documents into overlapping chunks for retrieval."""
    text_processor = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True
    )
    return text_processor.split_documents(raw_documents)


def index_documents(document_chunks):
    """Embed and store the chunks in the in-memory vector store."""
    DOCUMENT_VECTOR_DB.add_documents(document_chunks)


def find_related_documents(query):
    """Return the stored chunks most similar to the query."""
    return DOCUMENT_VECTOR_DB.similarity_search(query)


def generate_answer(user_query, context_documents):
    """Build the prompt from the retrieved context and query the LLM."""
    context_text = "\n\n".join([doc.page_content for doc in context_documents])
    conversation_prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    response_chain = conversation_prompt | LANGUAGE_MODEL
    response = response_chain.invoke(
        {"user_query": user_query, "document_context": context_text}
    )
    return response


# UI Configuration
st.title("📘 DocuMind AI")
st.markdown("### Your Intelligent Document Assistant")
st.markdown("---")

# File Upload Section
uploaded_pdf = st.file_uploader(
    "Upload Research Document (PDF)",
    type="pdf",
    help="Select a PDF document for analysis",
    accept_multiple_files=False
)

if uploaded_pdf:
    saved_path = save_uploaded_file(uploaded_pdf)
    raw_docs = load_pdf_documents(saved_path)
    processed_chunks = chunk_documents(raw_docs)
    index_documents(processed_chunks)

    st.success("✅ Document processed successfully! Ask your questions below.")

    user_input = st.chat_input("Enter your question about the document...")

    if user_input:
        with st.chat_message("user"):
            st.write(user_input)

        with st.spinner("Analyzing document..."):
            relevant_docs = find_related_documents(user_input)
            ai_response = generate_answer(user_input, relevant_docs)

        with st.chat_message("assistant", avatar="🤖"):
            st.write(ai_response)
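
# --- How to run (a minimal sketch; the file name app.py and the token value shown are assumptions) ---
# The script reads the API token via os.getenv("HUGGINGFACEHUB_API_TOKEN") above and passes it to
# HuggingFaceHub, so export the token before launching the app with Streamlit:
#
#   export HUGGINGFACEHUB_API_TOKEN="hf_your_token_here"
#   streamlit run app.py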