rbbist committed on
Commit
29058d8
·
verified ·
1 Parent(s): 548663f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -123
app.py CHANGED
@@ -2,8 +2,7 @@ import streamlit as st
2
  import os
3
  import tempfile
4
  from typing import List, Optional
5
- from pathlib import Path
6
- import time
7
 
8
  # Core libraries
9
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
@@ -13,8 +12,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain.schema import Document
14
  from langchain import PromptTemplate
15
  from langchain.chains import RetrievalQA
16
- from langchain_pinecone import PineconeVectorStore
17
- from pinecone import Pinecone as PineconeClient
18
 
19
  # Document loaders
20
  from langchain.document_loaders import PyPDFLoader
@@ -81,54 +79,39 @@ if 'chat_history' not in st.session_state:
81
  def setup_llm(model_name="google/flan-t5-small"):
82
  """Setup the language model for text generation"""
83
  with st.spinner("πŸ€– Loading language model..."):
84
- tokenizer = AutoTokenizer.from_pretrained(model_name)
85
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
86
-
87
- pipe = pipeline(
88
- "text2text-generation",
89
- model=model,
90
- tokenizer=tokenizer,
91
- max_new_tokens=300,
92
- temperature=0.3,
93
- do_sample=True
94
- )
95
-
96
- llm = HuggingFacePipeline(pipeline=pipe)
97
- return llm
 
 
 
 
 
98
 
99
  @st.cache_resource
100
  def setup_embeddings(model_name="all-MiniLM-L6-v2"):
101
  """Setup the embedding model for vector generation"""
102
  with st.spinner("πŸ”’ Loading embedding model..."):
103
- embeddings = HuggingFaceEmbeddings(model_name=model_name)
104
- return embeddings
105
-
106
- def setup_pinecone(api_key, environment="us-east-1", index_name="pdf-rag-system"):
107
- """Setup Pinecone vector database connection"""
108
- try:
109
- os.environ["PINECONE_API_KEY"] = api_key
110
- os.environ["PINECONE_ENVIRONMENT"] = environment
111
-
112
- pc = PineconeClient(api_key=api_key, environment=environment)
113
-
114
- existing_indexes = pc.list_indexes()
115
-
116
- if index_name not in [idx.name for idx in existing_indexes]:
117
- st.info(f"πŸ“ Creating new index: {index_name}")
118
- pc.create_index(
119
- name=index_name,
120
- dimension=384,
121
- metric='cosine'
122
- )
123
- time.sleep(30) # Wait for index to be ready
124
-
125
- return pc, index_name
126
- except Exception as e:
127
- st.error(f"❌ Error setting up Pinecone: {e}")
128
- return None, None
129
 
130
- def process_uploaded_files(uploaded_files, embeddings, pc, index_name):
131
- """Process uploaded PDF files and store in vector database"""
132
  if not uploaded_files:
133
  return None, []
134
 
@@ -145,6 +128,11 @@ def process_uploaded_files(uploaded_files, embeddings, pc, index_name):
145
  # Load PDF
146
  loader = PyPDFLoader(tmp_file_path)
147
  docs = loader.load()
 
 
 
 
 
148
  documents.extend(docs)
149
 
150
  # Clean up temporary file
@@ -172,28 +160,23 @@ def process_uploaded_files(uploaded_files, embeddings, pc, index_name):
172
  for i, text in enumerate(text_chunks):
173
  text.metadata.update({
174
  "chunk_id": i,
175
- "source_file": text.metadata.get("source", "unknown"),
176
  "chunk_size": len(text.page_content)
177
  })
178
 
179
  st.info(f"βœ‚οΈ Created {len(text_chunks)} text chunks")
180
 
181
- # Store in Pinecone
182
  try:
183
- vectorstore = PineconeVectorStore.from_documents(
184
- documents=text_chunks,
185
- embedding=embeddings,
186
- index_name=index_name
187
- )
188
- st.success(f"βœ… Successfully stored {len(text_chunks)} chunks in vector database!")
189
  return vectorstore, text_chunks
190
  except Exception as e:
191
- st.error(f"❌ Error storing in vector database: {e}")
192
  return None, []
193
 
194
  def create_qa_chain(llm, vectorstore, k=5):
195
  """Create a question-answering chain with retrieval"""
196
- if not vectorstore:
197
  return None
198
 
199
  prompt_template = """Use the following context to answer the question. If you cannot find the answer in the context, say "I cannot find this information in the provided documents."
@@ -209,16 +192,18 @@ Answer: Let me analyze the provided context to answer your question."""
209
  input_variables=["context", "question"]
210
  )
211
 
212
- qa_chain = RetrievalQA.from_chain_type(
213
- llm=llm,
214
- chain_type="stuff",
215
- retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
216
- chain_type_kwargs={"prompt": PROMPT},
217
- return_source_documents=True,
218
- verbose=True
219
- )
220
-
221
- return qa_chain
 
 
222
 
223
  def ask_question(qa_chain, question):
224
  """Ask a question and get an answer with sources"""
@@ -240,6 +225,18 @@ def ask_question(qa_chain, question):
240
  st.error(f"❌ Error processing question: {e}")
241
  return None
242
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  # Main App Interface
244
  def main():
245
  st.markdown('<h1 class="main-header">πŸ“š PDF RAG System</h1>', unsafe_allow_html=True)
@@ -249,21 +246,6 @@ def main():
249
  with st.sidebar:
250
  st.markdown('<h2 class="sidebar-header">βš™οΈ Configuration</h2>', unsafe_allow_html=True)
251
 
252
- # Pinecone configuration
253
- st.subheader("🌲 Pinecone Settings")
254
- pinecone_api_key = st.text_input(
255
- "Pinecone API Key",
256
- type="password",
257
- help="Enter your Pinecone API key",
258
- value=st.secrets.get("PINECONE_API_KEY", "") if "PINECONE_API_KEY" in st.secrets else ""
259
- )
260
-
261
- index_name = st.text_input(
262
- "Index Name",
263
- value="pdf-rag-system",
264
- help="Name for your Pinecone index"
265
- )
266
-
267
  # Model configuration
268
  st.subheader("πŸ€– Model Settings")
269
  llm_model = st.selectbox(
@@ -274,7 +256,7 @@ def main():
274
 
275
  embedding_model = st.selectbox(
276
  "Embedding Model",
277
- ["all-MiniLM-L6-v2", "all-mpnet-base-v2"],
278
  help="Choose the embedding model"
279
  )
280
 
@@ -285,6 +267,19 @@ def main():
285
  value=5,
286
  help="How many relevant chunks to use for answering questions"
287
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  # Main content area
290
  col1, col2 = st.columns([1, 1])
@@ -301,34 +296,32 @@ def main():
301
  if st.button("πŸš€ Process Documents", type="primary"):
302
  if not uploaded_files:
303
  st.warning("Please upload at least one PDF file.")
304
- elif not pinecone_api_key:
305
- st.warning("Please enter your Pinecone API key.")
306
  else:
307
  with st.spinner("Processing documents..."):
308
  # Setup models
309
  llm = setup_llm(llm_model)
310
  embeddings = setup_embeddings(embedding_model)
311
 
312
- # Setup Pinecone
313
- pc, idx_name = setup_pinecone(pinecone_api_key, index_name=index_name)
314
-
315
- if pc:
316
  # Process files
317
- vectorstore, text_chunks = process_uploaded_files(
318
- uploaded_files, embeddings, pc, idx_name
319
- )
320
 
321
  if vectorstore:
322
  # Create QA chain
323
  qa_chain = create_qa_chain(llm, vectorstore, k=retrieval_k)
324
 
325
- # Store in session state
326
- st.session_state.qa_chain = qa_chain
327
- st.session_state.vectorstore = vectorstore
328
- st.session_state.documents_processed = True
329
-
330
- st.balloons()
331
- st.success("πŸŽ‰ Documents processed successfully! You can now ask questions.")
 
 
 
 
 
332
 
333
  with col2:
334
  st.subheader("πŸ’¬ Ask Questions")
@@ -340,31 +333,46 @@ def main():
340
  help="Ask any question about your uploaded documents"
341
  )
342
 
343
- if st.button("πŸ” Get Answer"):
344
- if question:
345
- with st.spinner("Searching for answer..."):
346
- result = ask_question(st.session_state.qa_chain, question)
347
-
348
- if result:
349
- # Add to chat history
350
- st.session_state.chat_history.append({
351
- "question": question,
352
- "answer": result["answer"],
353
- "sources": result["source_documents"]
354
- })
355
 
356
- # Display answer
357
- st.subheader("πŸ’‘ Answer:")
358
- st.write(result["answer"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
 
360
- # Display sources
361
- if result["source_documents"]:
362
- st.subheader("πŸ“š Sources:")
363
- for i, doc in enumerate(result["source_documents"][:3]):
364
- with st.expander(f"Source {i+1}: {doc.metadata.get('source', 'Unknown')}"):
365
- st.write(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content)
366
- else:
367
- st.warning("Please enter a question.")
368
  else:
369
  st.info("πŸ‘† Please upload and process documents first to start asking questions.")
370
 
@@ -380,7 +388,7 @@ def main():
380
  if chat['sources']:
381
  st.write("**Sources:**")
382
  for j, doc in enumerate(chat['sources'][:2]): # Show top 2 sources
383
- st.write(f"{j+1}. {doc.metadata.get('source', 'Unknown')}")
384
 
385
  # Clear session button
386
  if st.session_state.documents_processed:
@@ -390,7 +398,7 @@ def main():
390
  st.session_state.documents_processed = False
391
  st.session_state.chat_history = []
392
  st.success("Session cleared! You can upload new documents.")
393
- st.experimental_rerun()
394
 
395
  if __name__ == "__main__":
396
  main()
 
2
  import os
3
  import tempfile
4
  from typing import List, Optional
5
+ import pickle
 
6
 
7
  # Core libraries
8
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
 
12
  from langchain.schema import Document
13
  from langchain import PromptTemplate
14
  from langchain.chains import RetrievalQA
15
+ from langchain.vectorstores import FAISS
 
16
 
17
  # Document loaders
18
  from langchain.document_loaders import PyPDFLoader
 
79
  def setup_llm(model_name="google/flan-t5-small"):
80
  """Setup the language model for text generation"""
81
  with st.spinner("πŸ€– Loading language model..."):
82
+ try:
83
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
84
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
85
+
86
+ pipe = pipeline(
87
+ "text2text-generation",
88
+ model=model,
89
+ tokenizer=tokenizer,
90
+ max_new_tokens=300,
91
+ temperature=0.3,
92
+ do_sample=True,
93
+ device_map="auto" if st.secrets.get("DEVICE", "cpu") == "gpu" else None
94
+ )
95
+
96
+ llm = HuggingFacePipeline(pipeline=pipe)
97
+ return llm
98
+ except Exception as e:
99
+ st.error(f"Error loading model: {e}")
100
+ return None
101
 
102
@st.cache_resource
def setup_embeddings(model_name="all-MiniLM-L6-v2"):
    """Build the sentence-embedding model used to vectorize document chunks.

    Cached via st.cache_resource so the model loads once per session;
    returns None (after reporting via st.error) when loading fails.
    """
    with st.spinner("🔢 Loading embedding model..."):
        try:
            return HuggingFaceEmbeddings(model_name=model_name)
        except Exception as e:
            st.error(f"Error loading embeddings: {e}")
            return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
+ def process_uploaded_files(uploaded_files, embeddings):
114
+ """Process uploaded PDF files and create FAISS vector store"""
115
  if not uploaded_files:
116
  return None, []
117
 
 
128
  # Load PDF
129
  loader = PyPDFLoader(tmp_file_path)
130
  docs = loader.load()
131
+
132
+ # Add file name to metadata
133
+ for doc in docs:
134
+ doc.metadata['source_file'] = uploaded_file.name
135
+
136
  documents.extend(docs)
137
 
138
  # Clean up temporary file
 
160
  for i, text in enumerate(text_chunks):
161
  text.metadata.update({
162
  "chunk_id": i,
 
163
  "chunk_size": len(text.page_content)
164
  })
165
 
166
  st.info(f"βœ‚οΈ Created {len(text_chunks)} text chunks")
167
 
168
+ # Create FAISS vector store
169
  try:
170
+ vectorstore = FAISS.from_documents(text_chunks, embeddings)
171
+ st.success(f"βœ… Successfully created vector database with {len(text_chunks)} chunks!")
 
 
 
 
172
  return vectorstore, text_chunks
173
  except Exception as e:
174
+ st.error(f"❌ Error creating vector database: {e}")
175
  return None, []
176
 
177
  def create_qa_chain(llm, vectorstore, k=5):
178
  """Create a question-answering chain with retrieval"""
179
+ if not vectorstore or not llm:
180
  return None
181
 
182
  prompt_template = """Use the following context to answer the question. If you cannot find the answer in the context, say "I cannot find this information in the provided documents."
 
192
  input_variables=["context", "question"]
193
  )
194
 
195
+ try:
196
+ qa_chain = RetrievalQA.from_chain_type(
197
+ llm=llm,
198
+ chain_type="stuff",
199
+ retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
200
+ chain_type_kwargs={"prompt": PROMPT},
201
+ return_source_documents=True
202
+ )
203
+ return qa_chain
204
+ except Exception as e:
205
+ st.error(f"Error creating QA chain: {e}")
206
+ return None
207
 
208
  def ask_question(qa_chain, question):
209
  """Ask a question and get an answer with sources"""
 
225
  st.error(f"❌ Error processing question: {e}")
226
  return None
227
 
228
def search_similar_chunks(vectorstore, query, k=5):
    """Return up to k chunks most similar to the query, with no LLM answer.

    Falls back to an empty list when no vector store is available or the
    similarity search raises (the error is surfaced via st.error).
    """
    if not vectorstore:
        return []

    try:
        return vectorstore.similarity_search(query, k=k)
    except Exception as e:
        st.error(f"Error searching: {e}")
        return []
239
+
240
  # Main App Interface
241
  def main():
242
  st.markdown('<h1 class="main-header">πŸ“š PDF RAG System</h1>', unsafe_allow_html=True)
 
246
  with st.sidebar:
247
  st.markdown('<h2 class="sidebar-header">βš™οΈ Configuration</h2>', unsafe_allow_html=True)
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  # Model configuration
250
  st.subheader("πŸ€– Model Settings")
251
  llm_model = st.selectbox(
 
256
 
257
  embedding_model = st.selectbox(
258
  "Embedding Model",
259
+ ["all-MiniLM-L6-v2", "sentence-transformers/all-mpnet-base-v2"],
260
  help="Choose the embedding model"
261
  )
262
 
 
267
  value=5,
268
  help="How many relevant chunks to use for answering questions"
269
  )
270
+
271
+ st.subheader("πŸ’Ύ Vector Store")
272
+ st.info("Using FAISS (local vector storage)")
273
+
274
+ # Option to save/load vector store
275
+ if st.session_state.vectorstore:
276
+ if st.button("πŸ’Ύ Save Vector Store"):
277
+ try:
278
+ # Save vector store to session state or file
279
+ st.session_state.vectorstore.save_local("faiss_index")
280
+ st.success("Vector store saved!")
281
+ except Exception as e:
282
+ st.error(f"Error saving: {e}")
283
 
284
  # Main content area
285
  col1, col2 = st.columns([1, 1])
 
296
  if st.button("πŸš€ Process Documents", type="primary"):
297
  if not uploaded_files:
298
  st.warning("Please upload at least one PDF file.")
 
 
299
  else:
300
  with st.spinner("Processing documents..."):
301
  # Setup models
302
  llm = setup_llm(llm_model)
303
  embeddings = setup_embeddings(embedding_model)
304
 
305
+ if llm and embeddings:
 
 
 
306
  # Process files
307
+ vectorstore, text_chunks = process_uploaded_files(uploaded_files, embeddings)
 
 
308
 
309
  if vectorstore:
310
  # Create QA chain
311
  qa_chain = create_qa_chain(llm, vectorstore, k=retrieval_k)
312
 
313
+ if qa_chain:
314
+ # Store in session state
315
+ st.session_state.qa_chain = qa_chain
316
+ st.session_state.vectorstore = vectorstore
317
+ st.session_state.documents_processed = True
318
+
319
+ st.balloons()
320
+ st.success("πŸŽ‰ Documents processed successfully! You can now ask questions.")
321
+ else:
322
+ st.error("Failed to create QA chain.")
323
+ else:
324
+ st.error("Failed to load models.")
325
 
326
  with col2:
327
  st.subheader("πŸ’¬ Ask Questions")
 
333
  help="Ask any question about your uploaded documents"
334
  )
335
 
336
+ col2a, col2b = st.columns([1, 1])
337
+
338
+ with col2a:
339
+ if st.button("πŸ” Get Answer"):
340
+ if question:
341
+ with st.spinner("Searching for answer..."):
342
+ result = ask_question(st.session_state.qa_chain, question)
 
 
 
 
 
343
 
344
+ if result:
345
+ # Add to chat history
346
+ st.session_state.chat_history.append({
347
+ "question": question,
348
+ "answer": result["answer"],
349
+ "sources": result["source_documents"]
350
+ })
351
+
352
+ # Display answer
353
+ st.subheader("πŸ’‘ Answer:")
354
+ st.write(result["answer"])
355
+
356
+ # Display sources
357
+ if result["source_documents"]:
358
+ st.subheader("πŸ“š Sources:")
359
+ for i, doc in enumerate(result["source_documents"][:3]):
360
+ with st.expander(f"Source {i+1}: {doc.metadata.get('source_file', 'Unknown')}"):
361
+ st.write(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content)
362
+ else:
363
+ st.warning("Please enter a question.")
364
+
365
+ with col2b:
366
+ if st.button("πŸ” Search Similar"):
367
+ if question:
368
+ with st.spinner("Searching for similar content..."):
369
+ results = search_similar_chunks(st.session_state.vectorstore, question, k=5)
370
 
371
+ if results:
372
+ st.subheader("πŸ” Similar Content:")
373
+ for i, doc in enumerate(results):
374
+ with st.expander(f"Match {i+1}: {doc.metadata.get('source_file', 'Unknown')}"):
375
+ st.write(doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content)
 
 
 
376
  else:
377
  st.info("πŸ‘† Please upload and process documents first to start asking questions.")
378
 
 
388
  if chat['sources']:
389
  st.write("**Sources:**")
390
  for j, doc in enumerate(chat['sources'][:2]): # Show top 2 sources
391
+ st.write(f"{j+1}. {doc.metadata.get('source_file', 'Unknown')}")
392
 
393
  # Clear session button
394
  if st.session_state.documents_processed:
 
398
  st.session_state.documents_processed = False
399
  st.session_state.chat_history = []
400
  st.success("Session cleared! You can upload new documents.")
401
+ st.rerun()
402
 
403
  if __name__ == "__main__":
404
  main()