rbbist committed
Commit 0446596 · verified · 1 Parent(s): b1e9890

Update app.py

Files changed (1): app.py (+389 -40)
app.py CHANGED
@@ -1,47 +1,396 @@
 import streamlit as st
-import PyPDF2
-from langchain.embeddings import SentenceTransformerEmbeddings
-from langchain.vectorstores import FAISS
+import os
+import tempfile
+from typing import List, Optional
+from pathlib import Path
+import time
+
+# Core libraries
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+from langchain.llms import HuggingFacePipeline
+from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+from langchain import PromptTemplate
 from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFacePipeline
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from langchain_pinecone import PineconeVectorStore
+from pinecone import Pinecone as PineconeClient, ServerlessSpec  # ServerlessSpec added: create_index() in the v3 client requires an index spec
+
+# Document loaders
+from langchain.document_loaders import PyPDFLoader
+
+# Configure Streamlit page
+st.set_page_config(
+    page_title="PDF RAG System",
+    page_icon="📚",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
 
-st.set_page_config(page_title="PDF QA App (CPU)", layout="wide")
-st.title("📘 Ask Questions from Uploaded PDFs (Free & CPU Friendly)")
+# Custom CSS for better styling
+st.markdown("""
+<style>
+.main-header {
+    font-size: 2.5rem;
+    color: #1f77b4;
+    text-align: center;
+    margin-bottom: 2rem;
+}
+.sidebar-header {
+    font-size: 1.5rem;
+    color: #ff7f0e;
+    margin-bottom: 1rem;
+}
+.success-message {
+    padding: 1rem;
+    background-color: #d4edda;
+    border: 1px solid #c3e6cb;
+    border-radius: 0.5rem;
+    color: #155724;
+    margin: 1rem 0;
+}
+.error-message {
+    padding: 1rem;
+    background-color: #f8d7da;
+    border: 1px solid #f5c6cb;
+    border-radius: 0.5rem;
+    color: #721c24;
+    margin: 1rem 0;
+}
+.source-box {
+    background-color: #f8f9fa;
+    border-left: 4px solid #007bff;
+    padding: 1rem;
+    margin: 0.5rem 0;
+    border-radius: 0 0.5rem 0.5rem 0;
+}
+</style>
+""", unsafe_allow_html=True)
 
-uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
+# Initialize session state
+if 'qa_chain' not in st.session_state:
+    st.session_state.qa_chain = None
+if 'vectorstore' not in st.session_state:
+    st.session_state.vectorstore = None
+if 'documents_processed' not in st.session_state:
+    st.session_state.documents_processed = False
+if 'chat_history' not in st.session_state:
+    st.session_state.chat_history = []
 
 @st.cache_resource
-def load_llm():
-    model_id = "google/flan-t5-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
-    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
-    return HuggingFacePipeline(pipeline=pipe)
-
-if uploaded_files:
-    st.info("Reading and processing PDFs...")
-    all_text = ""
-    for file in uploaded_files:
-        reader = PyPDF2.PdfReader(file)
-        for page in reader.pages:
-            text = page.extract_text()
-            if text:
-                all_text += text
-
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-    texts = text_splitter.split_text(all_text)
-
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    db = FAISS.from_texts(texts, embeddings)
-
-    retriever = db.as_retriever()
-    llm = load_llm()
-    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
-
-    question = st.text_input("Ask a question based on the uploaded PDFs:")
-    if question:
-        with st.spinner("Generating answer..."):
-            answer = qa_chain.run(question)
-            st.success(answer)
+def setup_llm(model_name="google/flan-t5-small"):
+    """Setup the language model for text generation"""
+    with st.spinner("🤖 Loading language model..."):
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+        pipe = pipeline(
+            "text2text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=300,
+            temperature=0.3,
+            do_sample=True
+        )
+
+        llm = HuggingFacePipeline(pipeline=pipe)
+        return llm
+
+@st.cache_resource
+def setup_embeddings(model_name="all-MiniLM-L6-v2"):
+    """Setup the embedding model for vector generation"""
+    with st.spinner("🔒 Loading embedding model..."):
+        embeddings = HuggingFaceEmbeddings(model_name=model_name)
+        return embeddings
+
+def setup_pinecone(api_key, environment="us-east-1", index_name="pdf-rag-system"):
+    """Setup Pinecone vector database connection"""
+    try:
+        os.environ["PINECONE_API_KEY"] = api_key
+        os.environ["PINECONE_ENVIRONMENT"] = environment
+
+        pc = PineconeClient(api_key=api_key, environment=environment)
+
+        existing_indexes = pc.list_indexes()
+
+        if index_name not in [idx.name for idx in existing_indexes]:
+            st.info(f"📝 Creating new index: {index_name}")
+            pc.create_index(
+                name=index_name,
+                dimension=384,  # matches all-MiniLM-L6-v2; all-mpnet-base-v2 would need 768
+                metric='cosine',
+                spec=ServerlessSpec(cloud="aws", region=environment)  # spec is required by the v3 client; serverless-on-AWS is an assumption
+            )
+            time.sleep(30)  # Wait for index to be ready
+
+        return pc, index_name
+    except Exception as e:
+        st.error(f"❌ Error setting up Pinecone: {e}")
+        return None, None
+
+def process_uploaded_files(uploaded_files, embeddings, pc, index_name):
+    """Process uploaded PDF files and store in vector database"""
+    if not uploaded_files:
+        return None, []
+
+    documents = []
+
+    # Process each uploaded file
+    for uploaded_file in uploaded_files:
+        try:
+            # Create temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                tmp_file.write(uploaded_file.read())
+                tmp_file_path = tmp_file.name
+
+            # Load PDF
+            loader = PyPDFLoader(tmp_file_path)
+            docs = loader.load()
+            documents.extend(docs)
+
+            # Clean up temporary file
+            os.unlink(tmp_file_path)
+
+            st.success(f"✅ Processed: {uploaded_file.name} ({len(docs)} pages)")
+
+        except Exception as e:
+            st.error(f"❌ Error processing {uploaded_file.name}: {e}")
+
+    if not documents:
+        return None, []
+
+    # Split documents into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=200,
+        length_function=len,
+        separators=["\n\n", "\n", " ", ""]
+    )
+
+    text_chunks = text_splitter.split_documents(documents)
+
+    # Add metadata to chunks
+    for i, text in enumerate(text_chunks):
+        text.metadata.update({
+            "chunk_id": i,
+            "source_file": text.metadata.get("source", "unknown"),
+            "chunk_size": len(text.page_content)
+        })
+
+    st.info(f"✂️ Created {len(text_chunks)} text chunks")
+
+    # Store in Pinecone
+    try:
+        vectorstore = PineconeVectorStore.from_documents(
+            documents=text_chunks,
+            embedding=embeddings,
+            index_name=index_name
+        )
+        st.success(f"✅ Successfully stored {len(text_chunks)} chunks in vector database!")
+        return vectorstore, text_chunks
+    except Exception as e:
+        st.error(f"❌ Error storing in vector database: {e}")
+        return None, []
+
+def create_qa_chain(llm, vectorstore, k=5):
+    """Create a question-answering chain with retrieval"""
+    if not vectorstore:
+        return None
+
+    prompt_template = """Use the following context to answer the question. If you cannot find the answer in the context, say "I cannot find this information in the provided documents."
+
+Context: {context}
+
+Question: {question}
+
+Answer: Let me analyze the provided context to answer your question."""
+
+    PROMPT = PromptTemplate(
+        template=prompt_template,
+        input_variables=["context", "question"]
+    )
+
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
+        chain_type_kwargs={"prompt": PROMPT},
+        return_source_documents=True,
+        verbose=True
+    )
+
+    return qa_chain
+
+def ask_question(qa_chain, question):
+    """Ask a question and get an answer with sources"""
+    if not qa_chain:
+        return None
+
+    try:
+        result = qa_chain({"query": question})
+
+        response = {
+            "question": question,
+            "answer": result["result"],
+            "source_documents": result.get("source_documents", [])
+        }
+
+        return response
+
+    except Exception as e:
+        st.error(f"❌ Error processing question: {e}")
+        return None
+
+# Main App Interface
+def main():
+    st.markdown('<h1 class="main-header">📚 PDF RAG System</h1>', unsafe_allow_html=True)
+    st.markdown("Upload PDF documents and ask questions about their content using AI-powered retrieval!")
+
+    # Sidebar for configuration
+    with st.sidebar:
+        st.markdown('<h2 class="sidebar-header">⚙️ Configuration</h2>', unsafe_allow_html=True)
+
+        # Pinecone configuration
+        st.subheader("🌲 Pinecone Settings")
+        pinecone_api_key = st.text_input(
+            "Pinecone API Key",
+            type="password",
+            help="Enter your Pinecone API key",
+            value=st.secrets.get("PINECONE_API_KEY", "") if "PINECONE_API_KEY" in st.secrets else ""
+        )
+
+        index_name = st.text_input(
+            "Index Name",
+            value="pdf-rag-system",
+            help="Name for your Pinecone index"
+        )
+
+        # Model configuration
+        st.subheader("🤖 Model Settings")
+        llm_model = st.selectbox(
+            "Language Model",
+            ["google/flan-t5-small", "google/flan-t5-base"],
+            help="Choose the language model (smaller models are faster)"
+        )
+
+        embedding_model = st.selectbox(
+            "Embedding Model",
+            ["all-MiniLM-L6-v2", "all-mpnet-base-v2"],
+            help="Choose the embedding model"
+        )
+
+        retrieval_k = st.slider(
+            "Number of chunks to retrieve",
+            min_value=1,
+            max_value=10,
+            value=5,
+            help="How many relevant chunks to use for answering questions"
+        )
+
+    # Main content area
+    col1, col2 = st.columns([1, 1])
+
+    with col1:
+        st.subheader("📁 Upload Documents")
+        uploaded_files = st.file_uploader(
+            "Choose PDF files",
+            type=['pdf'],
+            accept_multiple_files=True,
+            help="Upload one or more PDF files to analyze"
+        )
+
+        if st.button("🚀 Process Documents", type="primary"):
+            if not uploaded_files:
+                st.warning("Please upload at least one PDF file.")
+            elif not pinecone_api_key:
+                st.warning("Please enter your Pinecone API key.")
+            else:
+                with st.spinner("Processing documents..."):
+                    # Setup models
+                    llm = setup_llm(llm_model)
+                    embeddings = setup_embeddings(embedding_model)
+
+                    # Setup Pinecone
+                    pc, idx_name = setup_pinecone(pinecone_api_key, index_name=index_name)
+
+                    if pc:
+                        # Process files
+                        vectorstore, text_chunks = process_uploaded_files(
+                            uploaded_files, embeddings, pc, idx_name
+                        )
+
+                        if vectorstore:
+                            # Create QA chain
+                            qa_chain = create_qa_chain(llm, vectorstore, k=retrieval_k)
+
+                            # Store in session state
+                            st.session_state.qa_chain = qa_chain
+                            st.session_state.vectorstore = vectorstore
+                            st.session_state.documents_processed = True
+
+                            st.balloons()
+                            st.success("🎉 Documents processed successfully! You can now ask questions.")
+
+    with col2:
+        st.subheader("💬 Ask Questions")
+
+        if st.session_state.documents_processed:
+            question = st.text_input(
+                "Your question:",
+                placeholder="What are the main topics discussed in the documents?",
+                help="Ask any question about your uploaded documents"
+            )
+
+            if st.button("🔍 Get Answer"):
+                if question:
+                    with st.spinner("Searching for answer..."):
+                        result = ask_question(st.session_state.qa_chain, question)
+
+                        if result:
+                            # Add to chat history
+                            st.session_state.chat_history.append({
+                                "question": question,
+                                "answer": result["answer"],
+                                "sources": result["source_documents"]
+                            })
+
+                            # Display answer
+                            st.subheader("💡 Answer:")
+                            st.write(result["answer"])
+
+                            # Display sources
+                            if result["source_documents"]:
+                                st.subheader("📚 Sources:")
+                                for i, doc in enumerate(result["source_documents"][:3]):
+                                    with st.expander(f"Source {i+1}: {doc.metadata.get('source', 'Unknown')}"):
+                                        st.write(doc.page_content[:500] + "..." if len(doc.page_content) > 500 else doc.page_content)
+                else:
+                    st.warning("Please enter a question.")
+        else:
+            st.info("👆 Please upload and process documents first to start asking questions.")
+
+    # Chat History
+    if st.session_state.chat_history:
+        st.subheader("📝 Chat History")
+
+        for i, chat in enumerate(reversed(st.session_state.chat_history[-5:])):  # Show last 5
+            with st.expander(f"Q: {chat['question'][:50]}..."):
+                st.write("**Question:**", chat['question'])
+                st.write("**Answer:**", chat['answer'])
+
+                if chat['sources']:
+                    st.write("**Sources:**")
+                    for j, doc in enumerate(chat['sources'][:2]):  # Show top 2 sources
+                        st.write(f"{j+1}. {doc.metadata.get('source', 'Unknown')}")
+
+    # Clear session button
+    if st.session_state.documents_processed:
+        if st.button("🗑️ Clear Session"):
+            st.session_state.qa_chain = None
+            st.session_state.vectorstore = None
+            st.session_state.documents_processed = False
+            st.session_state.chat_history = []
+            st.success("Session cleared! You can upload new documents.")
+            st.experimental_rerun()
+
+if __name__ == "__main__":
+    main()
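
Note on dependencies: the new imports pull in several packages the previous version did not use. A minimal requirements.txt sketch follows; the package names are inferred from the imports in this commit, but the file itself is not part of the commit, so treat it as an assumption rather than the Space's actual pinned requirements:

streamlit
transformers
torch
sentence-transformers
langchain
langchain-pinecone
pinecone-client
pypdf

With a Pinecone API key at hand, the app starts like any Streamlit script: streamlit run app.py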