Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- app.py +94 -0
- doc_preprocessing.py +37 -0
- indexing.py +41 -0
- requirements.txt +0 -0
- retrieval.py +32 -0
app.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from doc_preprocessing import load_and_split_document
|
| 4 |
+
from indexing import initialize_pinecone, delete_index
|
| 5 |
+
from retrieval import retrieve_documents
|
| 6 |
+
from langchain_cohere import CohereEmbeddings, ChatCohere
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
import os
|
| 9 |
+
import time
|
| 10 |
+
from langchain_pinecone import PineconeVectorStore
|
| 11 |
+
|
| 12 |
+
# Load environment variables from a local .env file (COHERE_API_KEY, PINECONE_API_KEY)
load_dotenv()

# Set API keys
cohere_api = os.getenv("COHERE_API_KEY")
pinecone_api = os.getenv("PINECONE_API_KEY")

# Chat model for answer generation; embedding model for indexing/retrieval.
# NOTE(review): the Pinecone index in indexing.py is created with dimension=4096 —
# confirm this matches embed-english-v2.0's output dimension.
cohere_chat_model = ChatCohere(cohere_api_key=cohere_api)
cohere_embeddings = CohereEmbeddings(cohere_api_key=cohere_api, user_agent="my-app", model="embed-english-v2.0")
+
def pretty_print_docs(docs):
    """Format retrieved documents as a numbered, blank-line-separated string."""
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n{doc.page_content}")
    return "\n\n".join(sections)
|
| 22 |
+
|
| 23 |
+
# Streamlit session state survives script reruns; use it to keep the
# per-session Pinecone index name and retriever across interactions.
if "index_name" not in st.session_state:
    st.session_state.index_name = None
if "retriever" not in st.session_state:
    st.session_state.retriever = None

st.title("RAG-Based Document Search with LangChain")

# Upload PDF or DOCX document
uploaded_file = st.file_uploader("Upload a PDF or DOCX Document", type=["pdf", "docx"])

# Input for user query
query = st.text_input("Ask a question related to the uploaded document:")

# Index the uploaded document once per session (index_name is None only
# before the first successful indexing).
if uploaded_file is not None and st.session_state.index_name is None:
    # Detect file type from the extension (lowercased: "pdf" or "docx")
    file_type = uploaded_file.name.split(".")[-1].lower()

    # Create a unique index name for the session from the current timestamp
    # ('.' replaced with '-' to keep the name Pinecone-friendly)
    user_index = f"user-{str(time.time()).replace('.', '-')}"
    st.session_state.index_name = user_index

    # Save the uploaded file to a container-friendly path
    file_path = os.path.join("data", uploaded_file.name)  # Use relative path
    os.makedirs("data", exist_ok=True)  # Create the 'data' directory if it doesn't exist

    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Load and split the document, converting if necessary
    documents = load_and_split_document(file_path, file_type)

    # Initialize Pinecone index and embed the chunks into it
    index = initialize_pinecone(pinecone_api_key=pinecone_api, index_name=user_index)
    db = PineconeVectorStore.from_documents(
        documents=documents,
        embedding=cohere_embeddings,
        index_name=user_index,
    )

    # Store the retriever (top-5 chunks) in session state for later queries
    st.session_state.retriever = db.as_retriever(search_kwargs={"k": 5})
    st.write("Data Indexed Successfully")

# Answer queries once a retriever exists for this session
if st.session_state.retriever:
    if st.button("Submit"):
        # Retrieve documents based on the query and generate an answer
        result = retrieve_documents(query=query, retriever=st.session_state.retriever, llm=cohere_chat_model)

        st.header("Response:")
        st.write(result["answer"])

        st.write("-------------------------------------------------------------------")

        st.header("Context:")
        # The retrieval prompt instructs the LLM to answer "I don't know" for
        # off-context questions; suppress the context display in that case.
        if "I don't know" in result["answer"]:
            st.markdown("Can't fetch the context!!")
        else:
            st.markdown(pretty_print_docs(result["context"]))

# Clean up index when user ends the session
if st.button("End Session and Delete Index"):
    if st.session_state.index_name:
        delete_index(st.session_state.index_name, pinecone_api)
        st.success(f"Index '{st.session_state.index_name}' deleted.")
        st.session_state.index_name = None
        st.session_state.retriever = None
|
doc_preprocessing.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# document_processing.py
|
| 2 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
+
from docx import Document
|
| 5 |
+
import pdfkit
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def convert_docx_to_pdf(docx_file, pdf_file):
    """
    Convert a .docx file to a .pdf using pdfkit.

    Args:
        docx_file: Path to the source .docx file.
        pdf_file: Path the generated .pdf is written to.
    """
    # Parse and re-save the .docx in place. NOTE(review): this round-trip
    # looks redundant — Document(...) only re-serializes the same file;
    # confirm whether it can be dropped.
    document = Document(docx_file)
    document.save(docx_file)  # fixed: no need to wrap the path in an f-string

    # NOTE(review): pdfkit/wkhtmltopdf converts HTML (or plain text) input,
    # not .docx binaries — passing a .docx here is unlikely to produce a
    # faithful PDF. Consider docx2pdf or LibreOffice headless instead.
    pdfkit.from_file(docx_file, pdf_file)
| 18 |
+
def load_and_split_document(file_path, file_type):
    """
    Load a PDF or DOCX document and split it into overlapping text chunks.

    If the file is a DOCX it is first converted to PDF, then the PDF is
    loaded with PyPDFLoader and chunked with a recursive character splitter.

    Args:
        file_path: Path to the uploaded document on disk.
        file_type: Lowercased file extension ("pdf" or "docx").

    Returns:
        A list of LangChain Document chunks (chunk_size=1200, chunk_overlap=200).
    """
    # Convert DOCX to PDF if necessary
    if file_type == "docx":
        # Swap only the final extension for ".pdf". Using rsplit instead of
        # str.replace(".docx", ".pdf") avoids corrupting paths that contain
        # ".docx" elsewhere and handles any extension casing (e.g. ".DOCX").
        pdf_file = file_path.rsplit(".", 1)[0] + ".pdf"
        convert_docx_to_pdf(file_path, pdf_file)
        file_path = pdf_file  # Update file path to newly created PDF

    # Load the PDF document
    loader = PyPDFLoader(file_path)
    raw_documents = loader.load()

    # Chunk the text using recursive character splitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)

    return documents
|
indexing.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexing.py
|
| 2 |
+
from pinecone import Pinecone, ServerlessSpec
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
# Initialize Pinecone and create unique index
|
| 6 |
+
def initialize_pinecone(pinecone_api_key, index_name):
    """
    Create (if needed) and connect to a serverless Pinecone index.

    Args:
        pinecone_api_key: Pinecone API key.
        index_name: Name of the index to create/connect to.

    Returns:
        A Pinecone Index handle for `index_name`.
    """
    spec = ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )

    # fixed: use the parameter directly instead of the redundant local alias
    pc = Pinecone(api_key=pinecone_api_key)

    existing_indexes = [
        index_info["name"] for index_info in pc.list_indexes()]

    # check if index already exists (it shouldn't if this is first time)
    if index_name not in existing_indexes:
        # if does not exist, create index
        pc.create_index(
            index_name,
            # fixed comment: 4096 is the dimensionality of the Cohere
            # embed-english-v2.0 model used by app.py (ada-002 is 1536)
            dimension=4096,
            metric='dotproduct',
            spec=spec
        )
        # wait for index to be initialized
        # NOTE(review): no timeout — this can spin forever if creation fails
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    # connect to index
    index = pc.Index(index_name)
    time.sleep(1)  # brief settle delay before first use

    return index
| 38 |
+
# Delete Pinecone index when user quits
|
| 39 |
+
def delete_index(index_name, pinecone_api_key):
    """Delete the Pinecone index named `index_name` (used at session end)."""
    Pinecone(api_key=pinecone_api_key).delete_index(index_name)
|
requirements.txt
ADDED
|
Binary file (3.91 kB). View file
|
|
|
retrieval.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# retrieval.py
|
| 2 |
+
from langchain.retrievers import ContextualCompressionRetriever
|
| 3 |
+
from langchain_cohere import CohereRerank
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from langchain.chains.combine_documents import create_stuff_documents_chain
|
| 6 |
+
from langchain.chains import create_retrieval_chain
|
| 7 |
+
|
| 8 |
+
def retrieve_documents(query, retriever, llm):
    """
    Answer `query` with a RAG chain: rerank retrieved chunks, then stuff
    them into a grounded prompt for the LLM.

    Args:
        query: The user's question.
        retriever: Base retriever over the indexed document chunks.
        llm: Chat model used to generate the final answer.

    Returns:
        The dict produced by the retrieval chain's invoke(); app.py reads
        "answer" (generated text) and "context" (the reranked documents).
    """
    # Apply Cohere reranking model on top of the base retriever so only the
    # most relevant chunks reach the prompt.
    compressor = CohereRerank(model="rerank-english-v3.0")
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )

    # Grounding prompt: the model must answer only from {context} and reply
    # "I don't know" otherwise (app.py keys its context display off that phrase).
    prompt = """You are a good assistant that answers questions. Your knowledge is strictly limited to the following piece of context. Use it to answer the question at the end.
If the answer can't be found in the context, just say you don't know. *DO NOT* try to make up an answer.
If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
**MOST IMPORTANT: If question is not related to the context, just say "I don't know".**

Context: {context}
Question: {input}

"""

    prompt_template = ChatPromptTemplate.from_template(prompt)

    # Stuff the reranked documents into the prompt and run the full chain.
    document_chain = create_stuff_documents_chain(llm, prompt_template)
    retrieval_chain = create_retrieval_chain(compression_retriever, document_chain)
    response = retrieval_chain.invoke({"input":query})

    return response
|