# NOTE(review): removed extraction artifacts that preceded the code — a
# "File size" header, stray git short hashes (f95d6c7, cace76b, bb6d187),
# and a duplicated line-number gutter (1-180). None of it was source code,
# and the residue made the module unparseable.
# Standard library
import os
import time
import warnings

# Third-party
import streamlit as st
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
# Unshadowed alias for the Pinecone *client*: the langchain vectorstore
# import below rebinds the bare name `Pinecone`.
from pinecone import Pinecone as PineconeClient
# Document loading
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
# Chunking
from langchain.text_splitter import CharacterTextSplitter
# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Local
import keyfile

# Silence noisy deprecation warnings from the langchain/transformers stack
warnings.filterwarnings("ignore")


# Prompt template for the QA chain: the retrieved document chunks fill
# {context} and the user's query fills {question}.
# FIX: the original read "just say don't know/" — stray trailing slash and
# garbled phrasing in the fallback instruction.
template = """
You are a MLOPs engineer. The user will ask you a question about Machine Learning Operations.
Use the following piece of context to answer the question.
If you don't know the answer, just say you don't know.
Keep the answer brief

Context: {context}
Question: {question}
Answer:

"""

def setup_retrieval_qa_system(doc_directory, question, chunk_size=500, chunk_overlap=100):
    """Index the PDFs in *doc_directory* into Pinecone and answer *question*.

    Loads every file in ``doc_directory`` with ``PyPDFLoader``, splits the
    pages into overlapping chunks, embeds them with a 768-dim sentence
    transformer (matching the Pinecone index dimension), and runs a
    ``RetrievalQA`` "stuff" chain backed by a HuggingFace Hub LLM.

    Args:
        doc_directory: Path to a directory containing only PDF files.
        question: The user's natural-language question.
        chunk_size: Character length of each text chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        The answer string produced by the QA chain.

    Raises:
        ValueError: If the HuggingFace or Pinecone API key is missing.
    """
    load_dotenv()

    hugging_face = keyfile.Hugging_face_key
    if not hugging_face:
        raise ValueError("HuggingFace API key is missing. Please set it in the .env file.")
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = hugging_face

    pinecone_api_key = keyfile.PCToken
    if not pinecone_api_key:
        raise ValueError("pc API key is missing. Please set it in the .env file.")
    os.environ['PCToken'] = pinecone_api_key

    # FIX: keyfile.PCToken is the raw API-key string; the original then
    # called .list_indexes() on that string. Build a real client here.
    pc = PineconeClient(api_key=pinecone_api_key)

    # Serverless placement: env overrides with aws/us-east-1 fallbacks.
    cloud = os.environ.get("PINECONE_CLOUD") or "aws"
    region = os.environ.get("PINECONE_REGION") or "us-east-1"
    serv = ServerlessSpec(cloud=cloud, region=region)

    # FIX: Pinecone index names must be lowercase alphanumeric + hyphens;
    # "Bhagya-27thoct" would be rejected by the service.
    index_name = "bhagya-27thoct"

    # Create the index on first run; dimension 768 matches the
    # all-mpnet-base-v2 embedding model used below.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,
            metric="cosine",
            spec=serv,
        )
        # Block until the service reports the index as ready.
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    # Sanity check: show index stats before inserting new vectors.
    print("Index before inserting")
    print(pc.Index(index_name).describe_index_stats())

    all_docs = []
    with st.spinner('Loading and processing documents...'):
        for file_name in os.listdir(doc_directory):
            file_path = os.path.join(doc_directory, file_name)
            loader = PyPDFLoader(file_path)
            all_docs.extend(loader.load())

        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        splitted_chunks = text_splitter.split_documents(all_docs)

        # FIX: the original passed a Mixtral *chat* model to an embeddings
        # class (and used the undefined HuggingFaceInstructEmbeddings).
        # all-mpnet-base-v2 is a real sentence embedder with 768-dim output.
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )

        # FIX: the original branched on an inverted existence check and
        # referenced the undefined names `embeddings` and `docs`. Upserting
        # the split chunks is correct in both cases, and the unused FAISS
        # store / duplicate retriever have been removed as dead code.
        docsearch = PineconeVectorStore.from_documents(
            splitted_chunks, embedding_model, index_name=index_name
        )

    # FIX: `model_id` was undefined; the original also built this exact LLM
    # twice. One instance, explicitly named model.
    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    llm = HuggingFaceHub(
        repo_id=model_id,
        model_kwargs={"temperature": 0.8, "top_k": 50},
        huggingfacehub_api_token=hugging_face,
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
    )

    with st.spinner('Finding the best answer...'):
        # FIX: `query` was undefined — the user's question is the query.
        # RetrievalQA.run() returns a plain string, so the original
        # `result['result']` would have raised TypeError.
        result = qa_chain.run(question)

    return result

def main():
    """Render the Streamlit UI: gather documents and a question, then answer it."""
    st.title("📝 Document-Based Question Answering System with Groq")

    st.sidebar.header("Configuration")

    # Uploaded PDFs take priority over a manually entered directory path.
    uploaded_files = st.sidebar.file_uploader("Upload PDF documents", type="pdf", accept_multiple_files=True)
    doc_directory = st.text_input("Or enter the document directory path directly:", "")

    # Chunking knobs forwarded to the splitter.
    chunk_size = st.sidebar.slider("Set chunk size", 100, 1000, 500)
    chunk_overlap = st.sidebar.slider("Set chunk overlap", 0, 200, 100)

    question = st.text_input("Enter your question:")

    # Nothing to do until the user asks for an answer.
    if not st.button("Get Answer"):
        return

    if uploaded_files:
        # Persist the uploads into a scratch directory the loader can walk.
        doc_directory = "/tmp/streamlit_uploaded_docs"
        os.makedirs(doc_directory, exist_ok=True)
        for uploaded in uploaded_files:
            target_path = os.path.join(doc_directory, uploaded.name)
            with open(target_path, "wb") as out_file:
                out_file.write(uploaded.getbuffer())
    elif not doc_directory:
        st.warning("Please upload PDF files or provide a document directory.")
        return

    if not question:
        st.warning("Please provide a question.")
        return

    try:
        result = setup_retrieval_qa_system(doc_directory, question, chunk_size, chunk_overlap)
        st.success("Answer found!")
        st.write(f"**Answer:** {result}")
    except Exception as e:
        st.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()