swayam-the-coder committed on
Commit e619a67 · verified · 1 Parent(s): e31a079

Upload 4 files

Files changed (4)
  1. .streamlit/config.toml +6 -0
  2. LICENSE +21 -0
  3. app.py +411 -0
  4. requirements.txt +11 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
+ [theme]
+ primaryColor = "#FF6F61"
+ backgroundColor = "#272727"
+ secondaryBackgroundColor = "#1F2023"
+ textColor = "#FFFFFF"
+ font = "Roboto"
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Swayam Agrawal
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,411 @@
+ import os
+ import streamlit as st
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from pprint import pprint
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.pydantic_v1 import BaseModel, Field
+ from langchain import hub
+ from langchain_core.output_parsers import StrOutputParser
+ from typing import List
+ from typing_extensions import TypedDict
+ from langgraph.graph import StateGraph, END
+
+ # Streamlit setup with new theme and typography
+ st.set_page_config(page_title="SELF-RAG Workflow Application", page_icon="🤖", layout="centered")
+ st.markdown(
+     """
+     <style>
+     .main {
+         background-color: #272727;
+         font-family: 'Helvetica Neue', sans-serif;
+     }
+     .sidebar .sidebar-content {
+         background-color: #2E3944;
+         color: #ffffff;
+     }
+     h1 {
+         color: #14A76C;
+     }
+     .stTextInput {
+         border: 1px solid #272727;
+         border-radius: 5px;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ # Sidebar with instructions and API key input
+ st.sidebar.title("Instructions")
+ st.sidebar.write("""
+ 1. Enter your OpenAI API Key.
+ 2. Enter your question in the text box.
+ 3. Provide URLs for the documents you want to use.
+ 4. Click on the 'Run Workflow' button.
+ 5. View the results below.
+ """)
+ api_key = st.sidebar.text_input("Enter your OpenAI API Key:", type="password")
+
+ st.title("SELF-RAG Workflow Application")
+ input_text = st.text_input("Enter your question:")
+ urls_input = st.text_area("Enter URLs (one per line):")
+ urls = [url.strip() for url in urls_input.split('\n') if url.strip()]
+ inputs = {"question": input_text, "transform_attempts": 0}
+
+ if st.button("Run Workflow"):
+     if not api_key:
+         st.error("Please enter your OpenAI API Key.")
+     elif not urls:
+         st.error("Please provide at least one URL.")
+     elif not input_text:
+         st.error("Please enter a question.")
+     else:
+         # Document loading and processing
+         try:
+             texts = []
+             docs = []
+             for url in urls:
+                 try:
+                     docs.extend(WebBaseLoader(url).load())
+                 except Exception as e:
+                     st.error(f"Error loading document from {url}: {e}")
+             if not docs:
+                 st.error("No documents loaded. Please check the URLs.")
+             else:
+                 text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                     chunk_size=250, chunk_overlap=0
+                 )
+                 doc_splits = text_splitter.split_documents(docs)
+
+                 # Add to vectorDB
+                 vectorstore = FAISS.from_documents(
+                     documents=doc_splits,
+                     embedding=OpenAIEmbeddings(openai_api_key=api_key),
+                 )
+                 retriever = vectorstore.as_retriever()
+
+                 ### Retrieval Grader
+                 # Data model
+                 class GradeDocuments(BaseModel):
+                     """Binary score for relevance check on retrieved documents."""
+                     binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
+
+                 # LLM with function call
+                 llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, openai_api_key=api_key)
+                 structured_llm_grader = llm.with_structured_output(GradeDocuments)
+
+                 # Prompt
+                 system = """You are a grader assessing relevance of a retrieved document to a user question. \n
+                     It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
+                     If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
+                     Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question."""
+                 grade_prompt = ChatPromptTemplate.from_messages(
+                     [
+                         ("system", system),
+                         ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
+                     ]
+                 )
+
+                 retrieval_grader = grade_prompt | structured_llm_grader
+                 question = input_text
+                 docs = retriever.get_relevant_documents(question)
+                 if not docs:
+                     st.error("No relevant documents found for the question.")
+                 else:
+                     doc_txt = docs[0].page_content  # first retrieved chunk, kept for inspection
+
+                     ### Generate
+                     # Prompt
+                     prompt = hub.pull("rlm/rag-prompt")
+
+                     # LLM
+                     llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)
+
+                     # Post-processing
+                     def format_docs(docs):
+                         return "\n\n".join(doc.page_content for doc in docs)
+
+                     # Chain
+                     rag_chain = prompt | llm | StrOutputParser()
+
+                     # Run
+                     generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
+
+                     ### Hallucination Grader
+                     # Data model
+                     class GradeHallucinations(BaseModel):
+                         """Binary score for hallucination present in generation answer."""
+                         binary_score: str = Field(description="Answer is grounded in the facts, 'yes' or 'no'")
+
+                     # LLM with function call
+                     llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, openai_api_key=api_key)
+                     structured_llm_grader = llm.with_structured_output(GradeHallucinations)
+
+                     # Prompt
+                     system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n
+                         Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
+                     hallucination_prompt = ChatPromptTemplate.from_messages(
+                         [
+                             ("system", system),
+                             ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
+                         ]
+                     )
+
+                     hallucination_grader = hallucination_prompt | structured_llm_grader
+
+                     ### Answer Grader
+                     # Data model
+                     class GradeAnswer(BaseModel):
+                         """Binary score to assess answer addresses question."""
+                         binary_score: str = Field(description="Answer addresses the question, 'yes' or 'no'")
+
+                     # LLM with function call
+                     llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, openai_api_key=api_key)
+                     structured_llm_grader = llm.with_structured_output(GradeAnswer)
+
+                     # Prompt
+                     system = """You are a grader assessing whether an answer addresses / resolves a question. \n
+                         Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
+                     answer_prompt = ChatPromptTemplate.from_messages(
+                         [
+                             ("system", system),
+                             ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
+                         ]
+                     )
+
+                     answer_grader = answer_prompt | structured_llm_grader
+
+                     ### Question Re-writer
+                     # LLM
+                     llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, openai_api_key=api_key)
+
+                     # Prompt
+                     system = """You are a question re-writer that converts an input question to a better version that is optimized \n
+                         for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+                     re_write_prompt = ChatPromptTemplate.from_messages(
+                         [
+                             ("system", system),
+                             (
+                                 "human",
+                                 "Here is the initial question: \n\n {question} \n Formulate an improved question.",
+                             ),
+                         ]
+                     )
+
+                     question_rewriter = re_write_prompt | llm | StrOutputParser()
+
+                     class GraphState(TypedDict):
+                         """
+                         Represents the state of our graph.
+
+                         Attributes:
+                             question: question
+                             generation: LLM generation
+                             documents: list of documents
+                             transform_attempts: number of query rewrites performed so far
+                         """
+                         question: str
+                         generation: str
+                         documents: List[str]
+                         transform_attempts: int
+
+                     ### Nodes
+                     def retrieve(state):
+                         """
+                         Retrieve documents
+
+                         Args:
+                             state (dict): The current graph state
+
+                         Returns:
+                             state (dict): New key added to state, documents, that contains retrieved documents
+                         """
+                         texts.append("---RETRIEVE---")
+                         question = state["question"]
+
+                         # Retrieval
+                         documents = retriever.get_relevant_documents(question)
+                         return {"documents": documents, "question": question, "transform_attempts": state.get("transform_attempts", 0)}
+
+                     def generate(state):
+                         """
+                         Generate answer
+
+                         Args:
+                             state (dict): The current graph state
+
+                         Returns:
+                             state (dict): New key added to state, generation, that contains LLM generation
+                         """
+                         texts.append("---GENERATE---")
+                         question = state["question"]
+                         documents = state["documents"]
+
+                         # RAG generation
+                         generation = rag_chain.invoke({"context": format_docs(documents), "question": question})
+                         return {"documents": documents, "question": question, "generation": generation, "transform_attempts": state.get("transform_attempts", 0)}
+
+                     def grade_documents(state):
+                         """
+                         Determines whether the retrieved documents are relevant to the question.
+
+                         Args:
+                             state (dict): The current graph state
+
+                         Returns:
+                             state (dict): Updates documents key with only filtered relevant documents
+                         """
+                         texts.append("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
+                         question = state["question"]
+                         documents = state["documents"]
+
+                         # Score each doc
+                         filtered_docs = []
+                         for d in documents:
+                             score = retrieval_grader.invoke(
+                                 {"question": question, "document": d.page_content}
+                             )
+                             grade = score.binary_score
+                             if grade == "yes":
+                                 texts.append("---GRADE: DOCUMENT RELEVANT---")
+                                 filtered_docs.append(d)
+                             else:
+                                 texts.append("---GRADE: DOCUMENT NOT RELEVANT---")
+                                 continue
+                         return {"documents": filtered_docs, "question": question, "transform_attempts": state.get("transform_attempts", 0)}
+
+                     def transform_query(state):
+                         """
+                         Transform the query to produce a better question.
+
+                         Args:
+                             state (dict): The current graph state
+
+                         Returns:
+                             state (dict): Updates question key with a re-phrased question
+                         """
+                         texts.append("---TRANSFORM QUERY---")
+                         question = state["question"]
+                         documents = state["documents"]
+
+                         # Re-write question
+                         better_question = question_rewriter.invoke({"question": question})
+                         return {"documents": documents, "question": better_question, "transform_attempts": state.get("transform_attempts", 0) + 1}
+
+                     ### Edges
+                     def decide_to_generate(state):
+                         """
+                         Determines whether to generate an answer, or re-generate a question.
+
+                         Args:
+                             state (dict): The current graph state
+
+                         Returns:
+                             str: Decision for next node to call
+                         """
+                         texts.append("---ASSESS GRADED DOCUMENTS---")
+                         filtered_documents = state["documents"]
+
+                         if not filtered_documents:
+                             if state.get("transform_attempts", 0) >= 3:
+                                 return "conclude_no_answer"
+                             else:
+                                 # All documents were filtered out by the relevance check,
+                                 # so re-write the query and retrieve again
+                                 texts.append(
+                                     "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, TRANSFORM QUERY---"
+                                 )
+                                 return "transform_query"
+                         else:
+                             # We have relevant documents, so generate answer
+                             texts.append("---DECISION: GENERATE---")
+                             return "generate"
+
+                     def grade_generation_v_documents_and_question(state):
+                         """
+                         Determines whether the generation is grounded in the document and answers question.
+
+                         Args:
+                             state (dict): The current graph state
+
+                         Returns:
+                             str: Decision for next node to call
+                         """
+                         texts.append("---CHECK HALLUCINATIONS---")
+                         question = state["question"]
+                         documents = state["documents"]
+                         generation = state["generation"]
+
+                         score = hallucination_grader.invoke(
+                             {"documents": documents, "generation": generation}
+                         )
+                         grade = score.binary_score
+
+                         # Check hallucination
+                         if grade == "yes":
+                             texts.append("---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---")
+                             # Check question-answering
+                             texts.append("---GRADE GENERATION vs QUESTION---")
+                             score = answer_grader.invoke({"question": question, "generation": generation})
+                             grade = score.binary_score
+                             if grade == "yes":
+                                 texts.append("---DECISION: GENERATION ADDRESSES QUESTION---")
+                                 return "useful"
+                             else:
+                                 texts.append("---DECISION: GENERATION DOES NOT ADDRESS QUESTION---")
+                                 return "not useful"
+                         else:
+                             texts.append("---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY---")
+                             return "not supported"
+
+                     workflow = StateGraph(GraphState)
+
+                     # Define the nodes
+                     workflow.add_node("retrieve", retrieve)  # retrieve
+                     workflow.add_node("grade_documents", grade_documents)  # grade documents
+                     workflow.add_node("generate", generate)  # generate
+                     workflow.add_node("transform_query", transform_query)  # transform_query
+                     workflow.add_node("conclude_no_answer", lambda state: {"question": state["question"], "generation": "I don't know the answer since none of the given documents are relevant to the question.", "documents": [], "transform_attempts": state.get("transform_attempts", 0)})
+
+                     # Build graph
+                     workflow.set_entry_point("retrieve")
+                     workflow.add_edge("retrieve", "grade_documents")
+                     workflow.add_conditional_edges(
+                         "grade_documents",
+                         decide_to_generate,
+                         {
+                             "transform_query": "transform_query",
+                             "generate": "generate",
+                             "conclude_no_answer": "conclude_no_answer",
+                         },
+                     )
+                     workflow.add_edge("transform_query", "retrieve")
+                     workflow.add_edge("conclude_no_answer", END)  # fallback node ends the run
+                     workflow.add_conditional_edges(
+                         "generate",
+                         grade_generation_v_documents_and_question,
+                         {
+                             "not supported": "generate",
+                             "useful": END,
+                             "not useful": "transform_query",
+                         },
+                     )
+
+                     # Compile
+                     app = workflow.compile()
+
+                     try:
+                         for output in app.stream(inputs):
+                             for key, value in output.items():
+                                 for i in texts:
+                                     st.write(i)
+                                 texts = []
+                         # Final generation
+                         st.write('## Final Answer')
+                         st.write(value["generation"])
+                     except Exception as e:
+                         st.error(f"Error in workflow execution: {e}")
+         except Exception as e:
+             st.error(f"Error in document processing: {e}")
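Every grader in app.py (retrieval, hallucination, and answer) uses the same LangChain pattern: a ChatPromptTemplate piped into a ChatOpenAI model whose output is constrained to a small Pydantic schema via with_structured_output. The standalone sketch below illustrates that pattern outside Streamlit; the sample document and question strings, and the use of an OPENAI_API_KEY environment variable instead of the sidebar input, are assumptions for illustration and are not part of this commit.

# Minimal sketch of the structured-output grader pattern used throughout app.py.
# Assumes OPENAI_API_KEY is set in the environment (the app itself takes the key from the sidebar).
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

class GradeDocuments(BaseModel):
    """Binary relevance score for a retrieved document."""
    binary_score: str = Field(description="Document is relevant to the question, 'yes' or 'no'")

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Grade whether the retrieved document is relevant to the user question. Answer 'yes' or 'no'."),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)
retrieval_grader = grade_prompt | llm.with_structured_output(GradeDocuments)

result = retrieval_grader.invoke(
    {"document": "FAISS is a library for efficient similarity search of dense vectors.", "question": "What is FAISS?"}
)
print(result.binary_score)  # expected to print 'yes'

The compiled LangGraph workflow is driven the same way the app does it: app.stream({"question": ..., "transform_attempts": 0}) yields one state update per executed node, which the Streamlit loop then prints.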
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ streamlit
+ langchain
+ langchain_community
+ langchain_openai
+ langchain_core
+ langgraph
+ pydantic
+ typing-extensions
+ faiss-cpu
+ openai
+ tiktoken
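As an optional sanity check (illustrative only, not part of the commit), the snippet below verifies that a fresh environment can import everything requirements.txt lists; note that faiss-cpu installs the faiss module and typing-extensions installs typing_extensions.

# Optional sanity check: confirm the packages from requirements.txt import cleanly.
import importlib

for module in (
    "streamlit", "langchain", "langchain_community", "langchain_openai",
    "langchain_core", "langgraph", "pydantic", "typing_extensions",
    "faiss", "openai", "tiktoken",
):
    importlib.import_module(module)  # raises ImportError if a dependency is missing
print("All requirements import successfully.")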