Upload 2 files
- SimpleRAG.py +135 -0
- app.py +96 -0
SimpleRAG.py
ADDED
@@ -0,0 +1,135 @@
# Import libraries
import os

import openai
import langchain
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_openai import ChatOpenAI

from dotenv import load_dotenv

# Load API keys and other settings from a local .env file
load_dotenv()


## Read the documents
def read_doc(directory):
    loader = DirectoryLoader(
        directory,
        glob="**/*.docx",  # match .docx files anywhere under the directory
        loader_cls=UnstructuredWordDocumentLoader
    )
    documents = loader.load()
    return documents


# Note: this module-level pipeline (load -> chunk -> embed -> index) runs once, at import time.
doc = read_doc('documents/')
print(f"Loaded {len(doc)} documents")


def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    # Split documents while keeping each chunk's source metadata
    chunks = text_splitter.split_documents(docs)

    # Print information about the chunks
    print(f"Split {len(docs)} documents into {len(chunks)} chunks")
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: Source: {chunk.metadata['source']}, Length: {len(chunk.page_content)} chars")

    return chunks  # return the chunks, not the original docs


documents = chunk_data(docs=doc)
print(f"Total chunks: {len(documents)}")


## OpenAI embeddings
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])

# Sanity check: embed a sample query and inspect the vector dimension
vectors = embeddings.embed_query("How are you?")
print(f"Embedding dimension: {len(vectors)}")


## Vector search DB in Pinecone
pc = pinecone.Pinecone(
    api_key=os.environ['PINECONE_API_KEY']  # keep secrets in the environment, never in source
)
index_name = "advrag"

# Embed the chunks and upsert them into the Pinecone index
index = Pinecone.from_documents(
    documents,
    embeddings,
    index_name=index_name
)


## Cosine-similarity retrieval from the vector DB
def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query, k=k)
    return matching_results


from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


def initialize_qa_chain():
    llm = ChatOpenAI(
        model_name="gpt-4",
        temperature=0.5
    )

    prompt_template = """
System: You are a helpful AI assistant that provides accurate and concise answers based on the given context. Always cite the specific source document when providing information.

Context: {context}

Question: {question}

Please provide a clear and direct answer based on the context above. If the information isn't available in the context, say so.
"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
    return chain


# Lazily initialized QA chain, cached across calls
qa_chain = None


def retrieve_answers(query, k=2):
    global qa_chain
    if qa_chain is None:
        qa_chain = initialize_qa_chain()

    try:
        # Get the most relevant chunks for the query
        matching_docs = retrieve_query(query, k=k)

        # Build the chain input
        chain_input = {
            "input_documents": matching_docs,
            "question": query
        }

        # Use invoke instead of __call__
        result = qa_chain.invoke(chain_input)
        return result['output_text']
    except Exception as e:
        return f"Error processing query: {str(e)}"


# Test the pipeline
our_query = "Identify the homework items that the client agreed to complete in each of the two coaching sessions."
answer = retrieve_answers(our_query)
print("\nAnswer:", answer)
app.py
ADDED
@@ -0,0 +1,96 @@
import streamlit as st
import os
import shutil
from SimpleRAG import read_doc, chunk_data, retrieve_answers
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize session state
if 'docs_processed' not in st.session_state:
    st.session_state['docs_processed'] = False

# Set page config
st.set_page_config(
    page_title="Document Q&A System",
    page_icon="📄",
    layout="wide"
)

# Title and description
st.title("📄 Document Question & Answer System")
st.markdown("""
This application allows you to upload documents and ask questions about their content.
The system uses Retrieval-Augmented Generation (RAG) to answer from the uploaded documents.
""")

# Check for required environment variables
if not os.environ.get('OPENAI_API_KEY'):
    st.error("⚠️ OPENAI_API_KEY is not set in the environment variables!")
    st.stop()

# Sidebar for document upload
with st.sidebar:
    st.header("Document Upload")
    uploaded_files = st.file_uploader(
        "Upload your documents (DOCX format)",
        type=['docx'],
        accept_multiple_files=True
    )

    if uploaded_files:
        # Create/clear the documents directory
        if os.path.exists('documents'):
            shutil.rmtree('documents')
        os.makedirs('documents')

        # Save uploaded files
        for uploaded_file in uploaded_files:
            try:
                with open(os.path.join('documents', uploaded_file.name), 'wb') as f:
                    f.write(uploaded_file.getbuffer())
                st.success(f"✅ Successfully uploaded: {uploaded_file.name}")
            except Exception as e:
                st.error(f"❌ Error uploading {uploaded_file.name}: {str(e)}")

        if st.button("Process Documents"):
            try:
                with st.spinner("Processing documents..."):
                    # Read and chunk the uploaded documents
                    # (the Pinecone index itself is built when SimpleRAG is imported)
                    documents = read_doc('documents/')
                    if not documents:
                        st.error("❌ No valid documents found in the uploaded files.")
                        st.session_state['docs_processed'] = False
                    else:
                        chunks = chunk_data(documents)
                        st.session_state['docs_processed'] = True
                        st.success(f"✅ Successfully processed {len(documents)} documents into {len(chunks)} chunks!")
            except Exception as e:
                st.error(f"❌ Error processing documents: {str(e)}")
                st.session_state['docs_processed'] = False

# Main content area
st.header("Ask Questions")

# Input for the user's question
user_question = st.text_input("Enter your question about the documents:", key="user_question")

# Process the question
if user_question:
    if st.session_state.get('docs_processed', False):
        try:
            with st.spinner("Finding answer..."):
                answer = retrieve_answers(user_question)

            # Display the answer
            st.markdown("### Answer")
            st.write(answer)
        except Exception as e:
            st.error(f"❌ Error generating answer: {str(e)}")
    else:
        st.warning("⚠️ Please upload and process documents first!")

# Footer
st.markdown("---")
st.markdown("*Powered by OpenAI and Pinecone*")