File size: 3,697 Bytes
b0b3cb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# app.py
import streamlit as st
from doc_preprocessing import load_and_split_document
from indexing import initialize_pinecone, delete_index
from retrieval import retrieve_documents
from langchain_cohere import CohereEmbeddings, ChatCohere
from dotenv import load_dotenv
import os
import time
from langchain_pinecone import PineconeVectorStore

# Load settings via python-dotenv before any key is read from the environment
load_dotenv()

# Read the API keys from the environment and build the shared Cohere clients.
# A missing variable yields None here; it is passed through unchanged.
cohere_api = os.environ.get("COHERE_API_KEY")
pinecone_api = os.environ.get("PINECONE_API_KEY")
cohere_chat_model = ChatCohere(cohere_api_key=cohere_api)
cohere_embeddings = CohereEmbeddings(
    model="embed-english-v2.0",
    user_agent="my-app",
    cohere_api_key=cohere_api,
)

def pretty_print_docs(docs):
    """Render documents as numbered sections separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string in which every document appears under a
        ``Document N:`` heading (numbered from 1, in input order).
        An empty input produces an empty string.
    """
    sections = []
    for number, doc in enumerate(docs, start=1):
        heading = f"Document {number}:\n\n"
        sections.append(heading + doc.page_content)
    return "\n\n".join(sections)

# Seed the per-session keys on the first run so later code can read them
# without checking for their existence.
for state_key in ("index_name", "retriever"):
    if state_key not in st.session_state:
        st.session_state[state_key] = None

st.title("RAG-Based Document Search with LangChain")

# File picker restricted to PDF and DOCX uploads
uploaded_file = st.file_uploader(
    "Upload a PDF or DOCX Document",
    type=["pdf", "docx"],
)

# Free-text box for the user's question
query = st.text_input("Ask a question related to the uploaded document:")

# Index a newly uploaded document, but only while this session has no index yet.
if uploaded_file is not None and st.session_state.index_name is None:
    # The file name is supplied by the client, so keep only its final path
    # component; a name such as "../../app.py" must not escape the data folder.
    # Backslashes are normalised first so Windows-style names are handled too.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))

    # Detect file type from the extension (text after the last dot, lower-cased)
    file_type = safe_name.split(".")[-1].lower()

    # Create a unique index name for the session from the current timestamp
    user_index = f"user-{str(time.time()).replace('.', '-')}"

    # Save the upload under a relative, container-friendly directory
    os.makedirs("data", exist_ok=True)
    file_path = os.path.join("data", safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Load and split the document, converting if necessary
    documents = load_and_split_document(file_path, file_type)

    # Record the index name only now, right before the index is created.
    # If saving or parsing the file failed above, no phantom index name is left
    # in the session and the upload can simply be retried. Recording it before
    # the Pinecone calls keeps the "End Session" button able to delete an index
    # whose population failed part-way.
    st.session_state.index_name = user_index

    # Initialize the Pinecone index, then embed and upsert the chunks into it
    initialize_pinecone(pinecone_api_key=pinecone_api, index_name=user_index)
    db = PineconeVectorStore.from_documents(
        documents=documents,
        embedding=cohere_embeddings,
        index_name=user_index,
    )

    # Store the retriever in session state (top 5 matches per query)
    st.session_state.retriever = db.as_retriever(search_kwargs={"k": 5})
    st.write("Data Indexed Successfully")

# Show the submit button only once a retriever exists for this session
if st.session_state.retriever:
    if st.button("Submit"):
        if not query.strip():
            # Nothing to search for: do not send an empty question to the
            # retriever / LLM, ask the user for input instead.
            st.warning("Please enter a question before submitting.")
        else:
            # Retrieve documents and generate an answer for the query
            result = retrieve_documents(
                query=query,
                retriever=st.session_state.retriever,
                llm=cohere_chat_model,
            )

            st.header("Response:")
            st.write(result["answer"])

            st.write("-------------------------------------------------------------------")

            st.header("Context:")
            # When the answer contains "I don't know", the retrieved context
            # is not displayed.
            if "I don't know" in result["answer"]:
                st.markdown("Can't fetch the context!!")
            else:
                st.markdown(pretty_print_docs(result["context"]))

# Tear down the session's Pinecone index when the user asks to end the session.
# The button call comes first in the condition so the button is always drawn,
# even when there is no index to delete.
# NOTE(review): the uploader may still hold the file after the reset, in which
# case the upload branch above would index it again on the next rerun - confirm
# whether that is intended.
if st.button("End Session and Delete Index") and st.session_state.index_name:
    active_index = st.session_state.index_name
    delete_index(active_index, pinecone_api)
    st.success(f"Index '{active_index}' deleted.")
    # Reset the session so a new document can be indexed
    st.session_state.index_name = None
    st.session_state.retriever = None