Bhagyajoshi committed on
Commit
bb6d187
·
verified ·
1 Parent(s): f95d6c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +168 -1
app.py CHANGED
@@ -9,4 +9,171 @@ from langchain.document_loaders import TextLoader
9
  # This library will handle the splitting part of the data
10
  from langchain.text_splitter import CharacterTextSplitter
11
  # This library will handle embedding of data
12
- from langchain.embeddings import HuggingFaceEmbeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Standard library
import os
import time

# Third-party UI / config
import streamlit as st
from dotenv import load_dotenv

# This library will handle loading of the data
from langchain.document_loaders import TextLoader, PyPDFLoader
# This library will handle the splitting part of the data
from langchain.text_splitter import CharacterTextSplitter
# This library will handle embedding of data
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.chains import RetrievalQA
# Aliased so it does not shadow the `pinecone.Pinecone` client class below,
# and to match the `PineconeVectorStore` name used in the code.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# Pinecone vector-database client
from pinecone import Pinecone, ServerlessSpec
24
# Prompt template for the MLOps question-answering chain.
# `context` is filled with retrieved document chunks and `question` with the
# user's query; both are injected by the chain at run time.
# FIX: the original ended the "don't know" instruction with a stray "/"
# instead of "." — a typo that leaks into the LLM prompt.
template = """
You are a MLOPs engineer. The user will ask you a question about Machine Learning Operations.
Use the following piece of context to answer the question.
If you don't know the answer, just say don't know.
Keep the answer brief

Context: {context}
Question: {question}
Answer:

"""
36
def setup_retrieval_qa_system(doc_directory, question, chunk_size=500, chunk_overlap=100):
    """Build a Pinecone-backed RetrievalQA chain over the PDFs in
    *doc_directory* and answer *question* with it.

    Parameters
    ----------
    doc_directory : str
        Directory whose files are loaded as PDFs.
    question : str
        The user's question.
    chunk_size, chunk_overlap : int
        CharacterTextSplitter configuration.

    Returns
    -------
    str
        The answer produced by the QA chain.

    Raises
    ------
    ValueError
        If the HuggingFace or Pinecone API key is missing from the .env file.
    """
    load_dotenv()

    hugging_face = os.getenv("Hugging_face_key")
    if not hugging_face:
        raise ValueError("HuggingFace API key is missing. Please set it in the .env file.")
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = hugging_face

    pinecone_api_key = os.getenv("PCToken")
    if not pinecone_api_key:
        raise ValueError("pc API key is missing. Please set it in the .env file.")

    # FIX: the original assigned the raw API-key *string* to `pc` and then
    # called `pc.list_indexes()` on it (AttributeError). A real client must
    # be constructed from the key.
    pc = Pinecone(api_key=pinecone_api_key)

    # Serverless spec: cloud/region taken from env with sensible defaults.
    cloud = os.environ.get("PINECONE_CLOUD") or "aws"
    region = os.environ.get("PINECONE_REGION") or "us-east-1"
    serv = ServerlessSpec(cloud=cloud, region=region)

    # FIX: Pinecone index names must be lowercase alphanumeric/hyphens;
    # the original "Bhagya-27thoct" would be rejected.
    index_name = "bhagya-27thoct"

    # Create the index if it does not exist yet, then wait until it is ready.
    index_exists = index_name in pc.list_indexes().names()
    if not index_exists:
        pc.create_index(
            name=index_name,
            dimension=768,   # must match the embedding model's output size
            metric="cosine",
            spec=serv,
        )
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    print("Index before inserting")
    print(pc.Index(index_name).describe_index_stats())

    # Load every file in the directory as a PDF.
    all_docs = []
    with st.spinner('Loading and processing documents...'):
        for file_name in os.listdir(doc_directory):
            file_path = os.path.join(doc_directory, file_name)
            loader = PyPDFLoader(file_path)
            all_docs.extend(loader.load())

    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    splitted_chunks = text_splitter.split_documents(all_docs)

    # FIX: the original referenced undefined names (`HuggingFaceInstructEmbeddings`,
    # `FAISS`, `embeddings`) and passed an LLM repo id as an embedding model.
    # all-mpnet-base-v2 produces 768-dim vectors, matching the index dimension.
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

    # Upsert the chunks into a fresh index, or attach to the existing one.
    # FIX: the original indexed `docs` (only the last file's pages) instead of
    # the split chunks, and its membership test iterated index *descriptions*.
    if not index_exists:
        docsearch = PineconeVectorStore.from_documents(
            splitted_chunks, embedding_model, index_name=index_name
        )
    else:
        docsearch = PineconeVectorStore.from_existing_index(index_name, embedding_model)

    # FIX: `model_id` was undefined; the LLM was also constructed twice.
    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.8, "top_k": 50},
        huggingfacehub_api_token=hugging_face,
    )

    # Wire the module-level prompt `template` into the chain — the original
    # built a `rag_chain` with it and then never used it.
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        chain_type_kwargs={"prompt": prompt},
    )

    with st.spinner('Finding the best answer...'):
        # FIX: the original called qa_chain.run(query) with `query` undefined.
        # run() returns the answer string directly, so returning
        # result['result'] (as the original did) would also fail.
        result = qa_chain.run(question)

    return result
138
def main():
    """Streamlit entry point: gather PDFs (uploaded or from a directory),
    chunking options, and a question, then display the retrieval-QA answer."""
    st.title("📝 Document-Based Question Answering System with Groq")

    st.sidebar.header("Configuration")

    # Documents can arrive either as direct uploads or as a directory path.
    uploaded_files = st.sidebar.file_uploader("Upload PDF documents", type="pdf", accept_multiple_files=True)
    doc_directory = st.text_input("Or enter the document directory path directly:", "")

    # Text-splitter configuration, exposed in the sidebar.
    chunk_size = st.sidebar.slider("Set chunk size", 100, 1000, 500)
    chunk_overlap = st.sidebar.slider("Set chunk overlap", 0, 200, 100)

    question = st.text_input("Enter your question:")

    # Nothing to do until the user clicks the button.
    if not st.button("Get Answer"):
        return

    # Uploads take precedence: persist them to a temp directory so the
    # PDF loader can read them from disk.
    if uploaded_files:
        doc_directory = "/tmp/streamlit_uploaded_docs"
        os.makedirs(doc_directory, exist_ok=True)
        for uploaded in uploaded_files:
            with open(os.path.join(doc_directory, uploaded.name), "wb") as handle:
                handle.write(uploaded.getbuffer())
    elif not doc_directory:
        st.warning("Please upload PDF files or provide a document directory.")
        return

    if not question:
        st.warning("Please provide a question.")
        return

    try:
        answer = setup_retrieval_qa_system(doc_directory, question, chunk_size, chunk_overlap)
        st.success("Answer found!")
        st.write(f"**Answer:** {answer}")
    except Exception as e:
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()