# app.py import streamlit as st from doc_preprocessing import load_and_split_document from indexing import initialize_pinecone, delete_index from retrieval import retrieve_documents from langchain_cohere import CohereEmbeddings, ChatCohere from dotenv import load_dotenv import os import time from langchain_pinecone import PineconeVectorStore load_dotenv() # Set API keys cohere_api = os.getenv("COHERE_API_KEY") pinecone_api = os.getenv("PINECONE_API_KEY") cohere_chat_model = ChatCohere(cohere_api_key=cohere_api) cohere_embeddings = CohereEmbeddings(cohere_api_key=cohere_api, user_agent="my-app", model="embed-english-v2.0") def pretty_print_docs(docs): return "\n\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]) # Initialize session state if "index_name" not in st.session_state: st.session_state.index_name = None if "retriever" not in st.session_state: st.session_state.retriever = None st.title("RAG-Based Document Search with LangChain") # Upload PDF or DOCX document uploaded_file = st.file_uploader("Upload a PDF or DOCX Document", type=["pdf", "docx"]) # Input for user query query = st.text_input("Ask a question related to the uploaded document:") if uploaded_file is not None and st.session_state.index_name is None: # Detect file type file_type = uploaded_file.name.split(".")[-1].lower() # Create a unique index name for the session user_index = f"user-{str(time.time()).replace('.', '-')}" st.session_state.index_name = user_index # # Save the uploaded file to the "data" directory # file_path = os.path.join("C:/Users/ADMIN/Desktop/rag_assignment/data", uploaded_file.name) # with open(file_path, "wb") as f: # f.write(uploaded_file.getbuffer()) # Save the uploaded file to a container-friendly path file_path = os.path.join("data", uploaded_file.name) # Use relative path os.makedirs("data", exist_ok=True) # Create the 'data' directory if it doesn't exist with open(file_path, "wb") as f: f.write(uploaded_file.getbuffer()) # Load and split the document, converting if necessary documents = load_and_split_document(file_path, file_type) # Initialize Pinecone index index = initialize_pinecone(pinecone_api_key=pinecone_api, index_name=user_index) db = PineconeVectorStore.from_documents( documents=documents, embedding=cohere_embeddings, index_name=user_index, ) # Store the retriever in session state st.session_state.retriever = db.as_retriever(search_kwargs={"k": 5}) st.write("Data Indexed Successfully") # Add a submit button for query input if st.session_state.retriever: if st.button("Submit"): # Retrieve documents based on the query result = retrieve_documents(query=query, retriever=st.session_state.retriever, llm=cohere_chat_model) st.header("Response:") st.write(result["answer"]) st.write("-------------------------------------------------------------------") st.header("Context:") if "I don't know" in result["answer"]: st.markdown("Can't fetch the context!!") else: st.markdown(pretty_print_docs(result["context"])) # Clean up index when user ends the session if st.button("End Session and Delete Index"): if st.session_state.index_name: delete_index(st.session_state.index_name, pinecone_api) st.success(f"Index '{st.session_state.index_name}' deleted.") st.session_state.index_name = None st.session_state.retriever = None