Spaces: dimoZ/chat.pdf

Build error

App Files Community

dimoZ committed on Dec 5, 2024

Commit

c61da78

verified ·

1 Parent(s): 47ae0f2

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -69

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import os
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
-from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
-from langchain.prompts import PromptTemplate
 from langchain.chains.question_answering import load_qa_chain
 from dotenv import load_dotenv
 from fuzzywuzzy import process
@@ -14,100 +15,114 @@ load_dotenv()
 google_api_key = os.getenv("GOOGLE_API_KEY")
 if google_api_key is None:
     st.error("GOOGLE_API_KEY is not set. Please set it in the .env file.")
-else:
-    from google.generativeai import configure
-    configure(api_key=google_api_key)
-# Global variable to store chat history
-if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []
-# Function to extract text from uploaded PDF files
-def extract_text_from_pdfs(files):
     text = ""
-    for pdf in files:
-        reader = PdfReader(pdf)
-        for page in reader.pages:
             text += page.extract_text()
     return text
-# Function to split text into manageable chunks
 def split_text_into_chunks(text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     return splitter.split_text(text)
-# Create and store embeddings
-def create_vector_store(chunks):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    vector_store = FAISS.from_texts(chunks, embedding=embeddings)
     vector_store.save_local("faiss_index")
-    return vector_store
-# Load a previously created vector store
-def load_vector_store():
-    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
-    return FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
-# Generate a response using Gemini
-def generate_response(question, vector_store):
     docs = vector_store.similarity_search(question)
-    chain = get_qa_chain()
     response = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
     return response["output_text"]
-# Load the question-answering chain
-def get_qa_chain():
-    prompt = PromptTemplate(
-        template="""
-        Use the provided context to answer the question in detail. If the answer is unavailable, respond with "Answer not found in the provided context."
-        Context:\n{context}\n
-        Question:\n{question}\n
-        Answer:""",
-        input_variables=["context", "question"]
-    )
-    llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.5)
-    return load_qa_chain(llm, chain_type="stuff", prompt=prompt)
-# Suggest questions or keywords dynamically
-def suggest_keywords(query, all_texts):
-    return process.extract(query, all_texts, limit=5)
 # Main app function
 def main():
-    st.set_page_config(page_title="Virtual Agent App", layout="wide")
-    st.title("Virtual Agent Powered by Gemini")
-    # Sidebar for uploading files
     with st.sidebar:
         st.header("Upload Documents")
-        uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
-        if st.button("Process Files"):
-            if uploaded_files:
-                raw_text = extract_text_from_pdfs(uploaded_files)
-                text_chunks = split_text_into_chunks(raw_text)
                 create_vector_store(text_chunks)
                 st.success("Documents processed successfully!")
             else:
-                st.error("Please upload at least one PDF.")
-    # Main interface for questions and suggestions
-    user_question = st.text_input("Ask your question here (suggestions below):")
-    if user_question:
-        # Load vector store and generate suggestions
-        vector_store = load_vector_store()
-        suggestions = suggest_keywords(user_question, vector_store.similarity_search(user_question, k=10))
-        st.write("Suggestions:", [s[0] for s in suggestions])
-        # Generate and display response
-        if st.button("Submit Question"):
-            response = generate_response(user_question, vector_store)
-            st.write("Answer:", response)
-            st.session_state.chat_history.append({"question": user_question, "answer": response})
-    # Chat history download
     if st.sidebar.button("Download Chat History"):
-        chat_history = "\n".join([f"Q: {item['question']}\nA: {item['answer']}" for item in st.session_state.chat_history])
-        st.sidebar.download_button("Download History", chat_history, file_name="chat_history.txt", mime="text/plain")
 if __name__ == "__main__":
     main()

 import os
 import streamlit as st
 from PyPDF2 import PdfReader
+from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
+from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 from langchain.chains.question_answering import load_qa_chain
+from langchain.prompts import PromptTemplate
 from dotenv import load_dotenv
 from fuzzywuzzy import process
 google_api_key = os.getenv("GOOGLE_API_KEY")
 if google_api_key is None:
     st.error("GOOGLE_API_KEY is not set. Please set it in the .env file.")
+# Configure the Gemini API
+genai.configure(api_key=google_api_key)
+# Global variables
+if "chat_history" not in st.session_state:
     st.session_state.chat_history = []
+# List of predefined questions for suggestions
+suggested_questions = [
+    "What is the revenue of the company?",
+    "Who are the board members?",
+    "What are the key achievements mentioned in the report?",
+    "What is the company's growth strategy?",
+    "What are the major risks highlighted?",
+]
+# Function to extract text from PDF
+def extract_text_from_pdf(pdf_docs):
     text = ""
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
             text += page.extract_text()
     return text
+# Function to extract text from .docx
+def extract_text_from_docx(docx_docs):
+    text = ""
+    for doc in docx_docs:
+        document = Document(doc)
+        for para in document.paragraphs:
+            text += para.text + "\n"
+    return text
+# Function to split text into chunks
 def split_text_into_chunks(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     return splitter.split_text(text)
+# Function to create vector store
+def create_vector_store(text_chunks):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
     vector_store.save_local("faiss_index")
+# Function to load a QA chain
+def load_qa_chain_model():
+    prompt_template = """
+    Use the context provided to answer the question accurately. If the answer is not found, respond with "Answer not available in the context."
+    Context:\n{context}\n
+    Question:\n{question}\n
+    Answer:
+    """
+    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.5)
+    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+    return load_qa_chain(model, chain_type="stuff", prompt=prompt)
+# Function to process user questions
+def process_user_question(question):
+    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
+    vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
     docs = vector_store.similarity_search(question)
+    chain = load_qa_chain_model()
     response = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
     return response["output_text"]
 # Main app function
 def main():
+    st.set_page_config(page_title="Virtual Agent", layout="wide")
+    st.title("Ask Questions from Your Company Documents")
+    # Real-time suggestion box
+    user_input = st.text_input("Type your question", placeholder="Ask a question...", key="question_input")
+    suggestions = process.extract(user_input, suggested_questions, limit=5) if user_input else []
+    if user_input:
+        st.markdown("**Suggestions:**")
+        for suggestion, _ in suggestions:
+            st.button(suggestion, on_click=lambda s=suggestion: st.session_state.update({"question_input": s}))
+    # Sidebar for file upload
     with st.sidebar:
         st.header("Upload Documents")
+        pdf_docs = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
+        docx_docs = st.file_uploader("Upload .docx files", type="docx", accept_multiple_files=True)
+        if st.button("Process Documents"):
+            if pdf_docs or docx_docs:
+                st.spinner("Processing...")
+                pdf_text = extract_text_from_pdf(pdf_docs) if pdf_docs else ""
+                docx_text = extract_text_from_docx(docx_docs) if docx_docs else ""
+                combined_text = pdf_text + docx_text
+                text_chunks = split_text_into_chunks(combined_text)
                 create_vector_store(text_chunks)
                 st.success("Documents processed successfully!")
             else:
+                st.error("Please upload at least one document.")
+    # Handle question input and response
+    if user_input:
+        st.spinner("Generating response...")
+        answer = process_user_question(user_input)
+        st.session_state.chat_history.append({"question": user_input, "answer": answer})
+        st.write(f"**Answer:** {answer}")
+    # Chat history download option
     if st.sidebar.button("Download Chat History"):
+        chat_history = "\n".join([f"Q: {entry['question']}\nA: {entry['answer']}" for entry in st.session_state.chat_history])
+        st.sidebar.download_button("Download", chat_history, file_name="chat_history.txt", mime="text/plain")
 if __name__ == "__main__":
     main()