# NOTE(review): removed extraction artifacts that preceded the code — a
# "File size" header, stray git short hashes (f95d6c7, cace76b, bb6d187),
# and a duplicated line-number gutter (1-180). None of it was source code,
# and the residue made the module unparseable.
# Standard library
import os
import time
import warnings

# Third-party
import streamlit as st
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
# Unshadowed alias for the Pinecone *client*: the langchain vectorstore
# import below rebinds the bare name `Pinecone`.
from pinecone import Pinecone as PineconeClient
# Document loading
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
# Chunking
from langchain.text_splitter import CharacterTextSplitter
# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Local
import keyfile

# Silence noisy deprecation warnings from the langchain/transformers stack
warnings.filterwarnings("ignore")


# Prompt template for the QA chain: the retrieved document chunks fill
# {context} and the user's query fills {question}.
# FIX: the original read "just say don't know/" — stray trailing slash and
# garbled phrasing in the fallback instruction.
template = """
You are a MLOPs engineer. The user will ask you a question about Machine Learning Operations.
Use the following piece of context to answer the question.
If you don't know the answer, just say you don't know.
Keep the answer brief

Context: {context}
Question: {question}
Answer:

"""

def setup_retrieval_qa_system(doc_directory, question, chunk_size=500, chunk_overlap=100):
    """Index the PDFs in *doc_directory* into Pinecone and answer *question*.

    Loads every file in ``doc_directory`` with ``PyPDFLoader``, splits the
    pages into overlapping chunks, embeds them with a 768-dim sentence
    transformer (matching the Pinecone index dimension), and runs a
    ``RetrievalQA`` "stuff" chain backed by a HuggingFace Hub LLM.

    Args:
        doc_directory: Path to a directory containing only PDF files.
        question: The user's natural-language question.
        chunk_size: Character length of each text chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        The answer string produced by the QA chain.

    Raises:
        ValueError: If the HuggingFace or Pinecone API key is missing.
    """
    load_dotenv()

    hugging_face = keyfile.Hugging_face_key
    if not hugging_face:
        raise ValueError("HuggingFace API key is missing. Please set it in the .env file.")
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = hugging_face

    pinecone_api_key = keyfile.PCToken
    if not pinecone_api_key:
        raise ValueError("pc API key is missing. Please set it in the .env file.")
    os.environ['PCToken'] = pinecone_api_key

    # FIX: keyfile.PCToken is the raw API-key string; the original then
    # called .list_indexes() on that string. Build a real client here.
    pc = PineconeClient(api_key=pinecone_api_key)

    # Serverless placement: env overrides with aws/us-east-1 fallbacks.
    cloud = os.environ.get("PINECONE_CLOUD") or "aws"
    region = os.environ.get("PINECONE_REGION") or "us-east-1"
    serv = ServerlessSpec(cloud=cloud, region=region)

    # FIX: Pinecone index names must be lowercase alphanumeric + hyphens;
    # "Bhagya-27thoct" would be rejected by the service.
    index_name = "bhagya-27thoct"

    # Create the index on first run; dimension 768 matches the
    # all-mpnet-base-v2 embedding model used below.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,
            metric="cosine",
            spec=serv,
        )
        # Block until the service reports the index as ready.
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    # Sanity check: show index stats before inserting new vectors.
    print("Index before inserting")
    print(pc.Index(index_name).describe_index_stats())

    all_docs = []
    with st.spinner('Loading and processing documents...'):
        for file_name in os.listdir(doc_directory):
            file_path = os.path.join(doc_directory, file_name)
            loader = PyPDFLoader(file_path)
            all_docs.extend(loader.load())

        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        splitted_chunks = text_splitter.split_documents(all_docs)

        # FIX: the original passed a Mixtral *chat* model to an embeddings
        # class (and used the undefined HuggingFaceInstructEmbeddings).
        # all-mpnet-base-v2 is a real sentence embedder with 768-dim output.
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        # FIX: the original branched on an inverted existence check and
        # referenced the undefined names `embeddings` and `docs`. Upserting
        # the split chunks is correct in both cases, and the unused FAISS
        # store / duplicate retriever have been removed as dead code.
        docsearch = PineconeVectorStore.from_documents(
            splitted_chunks, embedding_model, index_name=index_name
        )

    # FIX: `model_id` was undefined; the original also built this exact LLM
    # twice. One instance, explicitly named model.
    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    llm = HuggingFaceHub(
        repo_id=model_id,
        model_kwargs={"temperature": 0.8, "top_k": 50},
        huggingfacehub_api_token=hugging_face,
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
    )

    with st.spinner('Finding the best answer...'):
        # FIX: `query` was undefined — the user's question is the query.
        # RetrievalQA.run() returns a plain string, so the original
        # `result['result']` would have raised TypeError.
        result = qa_chain.run(question)

    return result

def main():
    """Render the Streamlit UI: gather documents and a question, then answer it."""
    st.title("📝 Document-Based Question Answering System with Groq")

    st.sidebar.header("Configuration")

    # Uploaded PDFs take priority over a manually entered directory path.
    uploaded_files = st.sidebar.file_uploader("Upload PDF documents", type="pdf", accept_multiple_files=True)
    doc_directory = st.text_input("Or enter the document directory path directly:", "")

    # Chunking knobs forwarded to the splitter.
    chunk_size = st.sidebar.slider("Set chunk size", 100, 1000, 500)
    chunk_overlap = st.sidebar.slider("Set chunk overlap", 0, 200, 100)

    question = st.text_input("Enter your question:")

    # Nothing to do until the user asks for an answer.
    if not st.button("Get Answer"):
        return

    if uploaded_files:
        # Persist the uploads into a scratch directory the loader can walk.
        doc_directory = "/tmp/streamlit_uploaded_docs"
        os.makedirs(doc_directory, exist_ok=True)
        for uploaded in uploaded_files:
            target_path = os.path.join(doc_directory, uploaded.name)
            with open(target_path, "wb") as out_file:
                out_file.write(uploaded.getbuffer())
    elif not doc_directory:
        st.warning("Please upload PDF files or provide a document directory.")
        return

    if not question:
        st.warning("Please provide a question.")
        return

    try:
        result = setup_retrieval_qa_system(doc_directory, question, chunk_size, chunk_overlap)
        st.success("Answer found!")
        st.write(f"**Answer:** {result}")
    except Exception as e:
        st.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()