SyedBasitAbbas committed on
Commit
57f5dc8
Β·
verified Β·
1 Parent(s): 9d597d5

Upload 2 files

Browse files
Files changed (2) hide show
  1. SimpleRAG.py +135 -0
  2. app.py +96 -0
SimpleRAG.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import Libraries
2
+
3
+ import openai
4
+ import langchain
5
+ import pinecone
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.embeddings.openai import OpenAIEmbeddings
8
+ from langchain.vectorstores import Pinecone
9
+ from langchain.llms import OpenAI
10
+ from langchain_community.document_loaders import DirectoryLoader
11
+ from langchain_community.document_loaders import UnstructuredWordDocumentLoader
12
+ from langchain_openai import ChatOpenAI
13
+
14
+
15
+ from dotenv import load_dotenv
16
+ load_dotenv()
17
+
18
+
19
## Let's read the documents
def read_doc(directory):
    """Load every .docx file under *directory* (recursively) as LangChain Documents."""
    docx_loader = DirectoryLoader(
        directory,
        glob="**/*.docx",  # recurse into subfolders, Word documents only
        loader_cls=UnstructuredWordDocumentLoader,
    )
    return docx_loader.load()
28
+
29
+
30
# NOTE(review): mid-file import — conventionally belongs at the top of the file.
import os

# Loads all .docx files from documents/ at import time — importing this module
# (as app.py does) triggers disk I/O immediately.
doc = read_doc('documents/')
print(f"Loaded {len(doc)} documents")
33
+
34
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks for embedding.

    Args:
        docs: list of LangChain Document objects (e.g. from read_doc).
        chunk_size: maximum chunk length in characters.
        chunk_overlap: number of characters shared between consecutive chunks.

    Returns:
        The list of chunk Documents (each retains its source metadata).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split documents and maintain document identity
    chunks = text_splitter.split_documents(docs)

    # Print information about the chunks
    print(f"Split {len(docs)} documents into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        # .get() avoids a KeyError if a loader produced a chunk without
        # 'source' metadata (chunk.metadata['source'] used to crash here).
        source = chunk.metadata.get('source', '<unknown>')
        print(f"Chunk {i}: Source: {source}, Length: {len(chunk.page_content)} chars")

    return chunks  # Return chunks instead of original docs
50
+
51
# Chunk the loaded documents at import time.
documents=chunk_data(docs=doc)
len(documents)  # NOTE(review): bare expression — no effect in a script (notebook leftover)

## Embedding Technique Of OPENAI
# Requires OPENAI_API_KEY in the environment (loaded via load_dotenv above).
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
embeddings  # NOTE(review): bare expression — no effect in a script

# Smoke-test the embedding model with a single query.
vectors=embeddings.embed_query("How are you?")
len(vectors)  # NOTE(review): bare expression — no effect in a script
60
+
61
## Vector Search DB In Pinecone
import pinecone

# SECURITY FIX: an API key was previously hard-coded here (a leaked secret —
# it must be revoked). Read the key from the environment instead; .env is
# already loaded by load_dotenv() above, matching how OPENAI_API_KEY is handled.
pc = pinecone.Pinecone(
    api_key=os.environ['PINECONE_API_KEY']
)
index_name = "advrag"

# Embed every chunk and upsert into the Pinecone index at import time;
# returns a LangChain vector-store handle used by retrieve_query below.
# (Note: `Pinecone` here is the langchain.vectorstores class, which shadows
# nothing — the client class is accessed as pinecone.Pinecone above.)
index = Pinecone.from_documents(
    documents,
    embeddings,
    index_name=index_name
)
74
+
75
## Cosine-similarity retrieval of results from the vector DB
def retrieve_query(query, k=2):
    """Return the top-k chunks most similar to *query* from the Pinecone index."""
    return index.similarity_search(query, k=k)
79
+
80
+ from langchain.chains.question_answering import load_qa_chain
81
+ from langchain_openai import OpenAI
82
+ from langchain.chains import RetrievalQA
83
+ from langchain.prompts import PromptTemplate
84
+
85
def initialize_qa_chain():
    """Build the 'stuff'-type QA chain: GPT-4 plus a grounded-answer prompt.

    Returns a chain whose .invoke expects {"input_documents": [...],
    "question": "..."} and yields {"output_text": "..."}.
    """
    model = ChatOpenAI(
        model_name="gpt-4",
        temperature=0.5,
    )

    # The prompt instructs the model to stay within the retrieved context
    # and to admit when the answer is not present.
    template_text = """
    System: You are a helpful AI assistant that provides accurate and concise answers based on the given context. Always cite the specific source document when providing information.

    Context: {context}

    Question: {question}

    Please provide a clear and direct answer based on the context above. If the information isn't available in the context, say so.
    """

    grounded_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )

    return load_qa_chain(model, chain_type="stuff", prompt=grounded_prompt)
108
+
109
# QA chain shared across calls; built lazily on first use.
qa_chain = None

def retrieve_answers(query, k=2):
    """Answer *query* using the top-k retrieved chunks.

    Returns the model's answer text, or an error string if retrieval or
    generation fails (best-effort: callers never see an exception).
    """
    global qa_chain
    if qa_chain is None:
        qa_chain = initialize_qa_chain()

    try:
        # Fetch the most relevant chunks, then hand them to the chain.
        relevant_docs = retrieve_query(query, k=k)
        payload = {
            "input_documents": relevant_docs,
            "question": query,
        }
        # invoke() is the modern entry point (replaces __call__).
        response = qa_chain.invoke(payload)
        return response['output_text']
    except Exception as e:
        return f"Error processing query: {str(e)}"
131
+
132
# Test the function
# NOTE(review): this runs at import time, so importing SimpleRAG (as app.py
# does) fires a live LLM query — presumably a development leftover; consider
# guarding with `if __name__ == "__main__":`.
our_query = "Identify the homework items that the client agreed to complete in each of the two coaching sessions."
answer = retrieve_answers(our_query)
print("\nAnswer:", answer)
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import shutil
4
+ from SimpleRAG import read_doc, chunk_data, retrieve_answers
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
# Initialize session state — tracks whether documents have been processed
# (persists across Streamlit reruns for this browser session).
if 'docs_processed' not in st.session_state:
    st.session_state['docs_processed'] = False

# Set page config
st.set_page_config(
    page_title="Document Q&A System",
    page_icon="📚",
    layout="wide"
)

# Title and description
st.title("📚 Document Question & Answer System")
st.markdown("""
This application allows you to upload documents and ask questions about their content.
The system uses advanced RAG (Retrieval Augmented Generation) to provide accurate answers.
""")

# Check for required environment variables; stop rendering early if missing
# so no downstream OpenAI call can fail with a confusing error.
if not os.environ.get('OPENAI_API_KEY'):
    st.error("⚠️ OPENAI_API_KEY is not set in the environment variables!")
    st.stop()
32
+
33
# Sidebar for document upload
with st.sidebar:
    st.header("Document Upload")
    uploaded_files = st.file_uploader(
        "Upload your documents (DOCX format)",
        type=['docx'],
        accept_multiple_files=True
    )

    if uploaded_files:
        # Create/clear documents directory — each upload batch starts fresh,
        # discarding any previously saved files.
        if os.path.exists('documents'):
            shutil.rmtree('documents')
        os.makedirs('documents')

        # Save uploaded files to disk so SimpleRAG.read_doc can load them.
        for uploaded_file in uploaded_files:
            try:
                with open(os.path.join('documents', uploaded_file.name), 'wb') as f:
                    f.write(uploaded_file.getbuffer())
                st.success(f"✅ Successfully uploaded: {uploaded_file.name}")
            except Exception as e:
                st.error(f"❌ Error uploading {uploaded_file.name}: {str(e)}")

    if st.button("Process Documents"):
        try:
            with st.spinner("Processing documents..."):
                # Read and process documents
                documents = read_doc('documents/')
                if not documents:
                    st.error("❌ No valid documents found in the uploaded files.")
                    st.session_state['docs_processed'] = False
                else:
                    # NOTE(review): chunks are computed here but never embedded
                    # or upserted — answering relies on the Pinecone index built
                    # when SimpleRAG was imported, so freshly uploaded documents
                    # do not reach the vector store. Confirm this is intended.
                    chunks = chunk_data(documents)
                    st.session_state['docs_processed'] = True
                    st.success(f"✅ Successfully processed {len(documents)} documents into {len(chunks)} chunks!")
        except Exception as e:
            st.error(f"❌ Error processing documents: {str(e)}")
            st.session_state['docs_processed'] = False
72
+
73
# ---- Main content area: question input and answer display ----
st.header("Ask Questions")

# Text box for the user's question (keyed so Streamlit preserves it on rerun).
user_question = st.text_input("Enter your question about the documents:", key="user_question")

if user_question:
    docs_ready = st.session_state.get('docs_processed', False)
    if not docs_ready:
        # Guard clause: no documents processed yet, nothing to answer from.
        st.warning("⚠️ Please upload and process documents first!")
    else:
        try:
            with st.spinner("Finding answer..."):
                answer = retrieve_answers(user_question)

            # Display answer in a nice format
            st.markdown("### Answer")
            st.write(answer)
        except Exception as e:
            st.error(f"❌ Error generating answer: {str(e)}")

# Footer
st.markdown("---")
st.markdown("*Powered by OpenAI and Pinecone*")