Update app.py

app.py CHANGED

```diff
@@ -16,80 +16,156 @@ from langchain_openai import AzureOpenAIEmbeddings
 from langchain_openai import OpenAIEmbeddings
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 from langchain_openai import ChatOpenAI

-def main():
-    load_dotenv()
     )
-        - 📄 Support for multiple PDF files
-    """)

     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []

     with st.sidebar:
-        st.
         pdf_docs = st.file_uploader(
-            "Upload your PDFs here",
-            type=['pdf'],
-            accept_multiple_files=True,
-            help="Upload PDF files to analyze. Max file size: 200MB"
-        )
-
-        # File validation
-        if pdf_docs:
-            for doc in pdf_docs:
-                if doc.size > 200 * 1024 * 1024:  # 200 MB
-                    st.error(f"File {doc.name} is too large. Maximum file size is 200MB.")
-                    pdf_docs.remove(doc)
-
-        if st.button("Process Documents", type="primary"):
-            if not pdf_docs:
-                st.warning("Please upload at least one PDF file.")
-            else:
-                with st.spinner("Processing your documents..."):
-                    try:
-                        # get pdf text
-                        content, metadata = prepare_docs(pdf_docs)
-
-                        # get the text chunks
-                        split_docs = get_text_chunks(content, metadata)
-
-                        # create vector store
-                        vectorstore = ingest_into_vectordb(split_docs)
-
-                        # create conversation chain
-                        st.session_state.conversation = get_conversation_chain(vectorstore)
-
-                        st.success("Documents processed successfully! You can now ask questions.")
-                    except Exception as e:
-                        st.error(f"An error occurred while processing documents: {str(e)}")
-
-    # Question input section
-    user_question = st.text_input(
-        "📝 Ask a question about your documents",
-        placeholder="What insights can you provide from these documents?"
-    )
```
```diff
 from langchain_openai import OpenAIEmbeddings
 from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
 from langchain_openai import ChatOpenAI
+os.environ["OPENAI_API_KEY"] = "sk-.............."
+os.environ["GROQ_API_KEY"] = "........."
+from langchain_groq import ChatGroq
+
+llmtemplate = """You’re an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
+
+Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
+{question}
+
+Keep in mind the following instructions:
+- Your response should be direct and factual, limited to 50 words and 2-3 sentences.
+- Avoid using introductory phrases like "yes" or "no."
+- Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
+- If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
+- Do not fabricate information, include questions, or use confirmatory phrases.
+- Remember not to prompt for additional information or ask any questions.
+
+Ensure your response is strictly based on the content of the markdown document.
+"""
```
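The template keeps `{question}` as its only input variable, so it can be rendered with `PromptTemplate.from_template` exactly as `get_conversation_chain` does further down. A minimal sketch (the short template string here is a stand-in for `llmtemplate`):

```python
from langchain.prompts import PromptTemplate

# Stand-in for llmtemplate; {question} is the single input variable.
template = "Answer only from the supplied document.\nQuestion: {question}"
prompt = PromptTemplate.from_template(template)
print(prompt.format(question="What does the contract say about refunds?"))
```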
```diff
+def prepare_docs(pdf_docs):
+    docs = []
+    metadata = []
+    content = []
+
+    for pdf in pdf_docs:
+        print(pdf.name)
+        pdf_reader = PyPDF2.PdfReader(pdf)
+        for index, text in enumerate(pdf_reader.pages):
+            doc_page = {'title': pdf.name + " page " + str(index + 1),
+                        'content': pdf_reader.pages[index].extract_text()}
+            docs.append(doc_page)
+    for doc in docs:
+        content.append(doc["content"])
+        metadata.append({
+            "title": doc["title"]
+        })
+    return content, metadata
+
+
```
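`prepare_docs` flattens every page of every uploaded PDF into two parallel lists: page texts and per-page titles. A quick way to check the shapes outside Streamlit, assuming the function from this diff is in scope, is to pass an ordinary file handle, which also exposes the `.name` attribute the function relies on (the path below is a placeholder):

```python
# Placeholder path; any object with .name and a readable binary stream can
# stand in for Streamlit's UploadedFile here.
with open("example.pdf", "rb") as fh:
    content, metadata = prepare_docs([fh])

print(len(content))   # one entry per page across all uploads
print(metadata[0])    # e.g. {'title': 'example.pdf page 1'}
```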
```diff
+def get_text_chunks(content, metadata):
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=1024,
+        chunk_overlap=256,
     )
+    split_docs = text_splitter.create_documents(content, metadatas=metadata)
+    print(f"Split documents into {len(split_docs)} passages")
+    return split_docs
+
+
```
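`from_tiktoken_encoder` counts `chunk_size` and `chunk_overlap` in tokens rather than characters, so this produces roughly 1024-token passages with 256-token overlap, and it needs the `tiktoken` package installed. A self-contained sketch of the same call:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Lengths are measured in tiktoken tokens, not characters.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024,
    chunk_overlap=256,
)
split_docs = splitter.create_documents(
    ["first page text ...", "second page text ..."],
    metadatas=[{"title": "a.pdf page 1"}, {"title": "a.pdf page 2"}],
)
print(f"Split documents into {len(split_docs)} passages")
```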
```diff
+def ingest_into_vectordb(split_docs):
+    # embeddings = OpenAIEmbeddings()
+    # embeddings = FastEmbedEmbeddings()
+    embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
+    db = FAISS.from_documents(split_docs, embeddings)
+    DB_FAISS_PATH = 'vectorstore/db_faiss'
+    db.save_local(DB_FAISS_PATH)
+    return db
+
+
```
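Because `ingest_into_vectordb` persists the index under `vectorstore/db_faiss`, a later run can reload it instead of re-embedding everything. A sketch, assuming a recent `langchain_community` release (the explicit deserialization opt-in flag below does not exist in older versions) and `en_core_web_sm` installed via `python -m spacy download en_core_web_sm`:

```python
from langchain_community.embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS

# Reload with the same embedding model that built the index.
embeddings = SpacyEmbeddings(model_name="en_core_web_sm")
db = FAISS.load_local("vectorstore/db_faiss", embeddings,
                      allow_dangerous_deserialization=True)  # newer releases require this opt-in
print(db.similarity_search("What is this document about?", k=2))
```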
```diff
+def get_conversation_chain(vectordb):
+    # llama_llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
+    llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
+    retriever = vectordb.as_retriever()
+    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(llmtemplate)
+
+    memory = ConversationBufferMemory(
+        memory_key='chat_history', return_messages=True, output_key='answer')
+
+    conversation_chain = ConversationalRetrievalChain.from_llm(
+        llm=llm,
+        retriever=retriever,
+        # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+        memory=memory,
+        return_source_documents=True)
+    print("Conversational Chain created for the LLM using the vector store")
+    return conversation_chain
+
```
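Note that `CONDENSE_QUESTION_PROMPT` is built from `llmtemplate` but the `condense_question_prompt` argument stays commented out, so the chain runs with LangChain's default question-condensing prompt. With `return_source_documents=True` and `output_key='answer'`, one call returns the answer, the accumulated history, and the retrieved passages, which is the shape `handle_userinput` below depends on. A sketch, with `db` as returned by `ingest_into_vectordb`:

```python
# One round trip through the chain; db comes from ingest_into_vectordb.
chain = get_conversation_chain(db)
result = chain({"question": "Summarise the key points."})

print(result["answer"])                 # the model's reply
print(len(result["chat_history"]))      # alternating user/AI messages
print([d.metadata["title"] for d in result["source_documents"]])  # retrieved pages
```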
```diff
+def validate_answer_against_sources(response_answer, source_documents):
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    similarity_threshold = 0.5
+    source_texts = [doc.page_content for doc in source_documents]
+
+    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
+    source_embeddings = model.encode(source_texts, convert_to_tensor=True)
+
+    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
+
+    if any(score.item() > similarity_threshold for score in cosine_scores[0]):
+        return True
+
+    return False
+
```
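`validate_answer_against_sources` is added here but nothing in this commit calls it. One way it could be hooked in after a chain call, as a sketch rather than part of the commit:

```python
def answer_with_grounding_check(conversation, question):
    """Sketch: route one question through the chain, then test whether the
    answer is similar enough to at least one retrieved source passage."""
    result = conversation({"question": question})
    grounded = validate_answer_against_sources(
        result["answer"], result["source_documents"])
    return result["answer"], grounded
```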
```diff
+def handle_userinput(user_question):
+    response = st.session_state.conversation({'question': user_question})
+    st.session_state.chat_history = response['chat_history']

+    for i, message in enumerate(st.session_state.chat_history):
+        print(i)
+        if i % 2 == 0:
+            st.write(user_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+        else:
+            print(message.content)
+            st.write(bot_template.replace(
+                "{{MSG}}", message.content), unsafe_allow_html=True)
+
+
```
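`css`, `user_template`, and `bot_template` are defined or imported above the hunk shown here; each template carries a literal `{{MSG}}` placeholder that `handle_userinput` swaps for the message text. Illustrative values only:

```python
# Illustrative stand-in; the real templates live outside this hunk.
user_template = '<div class="chat-message user">{{MSG}}</div>'
print(user_template.replace("{{MSG}}", "What does section 2 cover?"))
```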
```diff

+def main():
+    load_dotenv()
+
+    st.set_page_config(page_title="Chat with your PDFs",
+                       page_icon=":books:")
+    st.write(css, unsafe_allow_html=True)

     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = []

+    st.header("Chat with multiple PDFs :books:")
+    user_question = st.text_input("Ask a question about your documents:")
+
+    if user_question:
+        handle_userinput(user_question)
+
     with st.sidebar:
+        st.subheader("Your documents")
         pdf_docs = st.file_uploader(
+            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)

+        if st.button("Process"):
+            with st.spinner("Processing"):
+                # get pdf text
+                content, metadata = prepare_docs(pdf_docs)
+
+                # get the text chunks
+                split_docs = get_text_chunks(content, metadata)
+
+                # create vector store
+                vectorstore = ingest_into_vectordb(split_docs)
+
+                # create conversation chain
+                st.session_state.conversation = get_conversation_chain(
+                    vectorstore)
+
+
+if __name__ == '__main__':
+    main()
```
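As before, the app launches with `streamlit run app.py`; this revision additionally needs the `langchain_groq` package installed and a real key in place of the `GROQ_API_KEY` placeholder above.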