Spaces:

RAVENOCC
/

Multi-PDF-Chatbot

Sleeping

App Files Files Community

RAVENOCC committed on Jun 16, 2025

Commit

172fd5d

verified ·

1 Parent(s): 8a4bd37

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +282 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,284 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from langchain_groq import ChatGroq
+from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_community.llms import HuggingFaceHub
+from langchain.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+from htmlTemplates import css, bot_template, user_template
+import os
+def get_pdf_text(pdf_docs):
+    """Concatenate the extracted text of every page of every given PDF.
+
+    Args:
+        pdf_docs: Iterable of PDF sources that ``PdfReader`` can open
+            (the caller passes the files from the Streamlit uploader).
+
+    Returns:
+        One string containing the page texts in document/page order, with
+        no separator inserted between pages. Empty when nothing could be
+        extracted.
+    """
+    page_texts = []
+    for pdf in pdf_docs:
+        pdf_reader = PdfReader(pdf)
+        for page in pdf_reader.pages:
+            # A page without a text layer (e.g. a scanned image) may yield an
+            # empty or missing result; treat it as "" so a single such page
+            # cannot raise while the text is being assembled.
+            page_texts.append(page.extract_text() or "")
+    # Join once instead of growing a string with += inside the loop.
+    return "".join(page_texts)
+def get_text_chunks(text):
+    """Split ``text`` into overlapping chunks for embedding.
+
+    The text is split on newlines into chunks of at most 1000 characters
+    (measured with ``len``), with a 200-character overlap between
+    consecutive chunks.
+
+    Args:
+        text: The full text to split.
+
+    Returns:
+        The list of chunk strings produced by the splitter.
+    """
+    splitter_options = {
+        "separator": "\n",
+        "chunk_size": 1000,
+        "chunk_overlap": 200,
+        "length_function": len,
+    }
+    return CharacterTextSplitter(**splitter_options).split_text(text)
+def get_vector_store(text_chunks):
+    """Embed ``text_chunks`` and index them in a FAISS vector store.
+
+    Embeddings come from the ``BAAI/bge-small-en`` model, run on CPU with
+    normalized output and cached under ``/tmp/huggingface_cache``.
+
+    Args:
+        text_chunks: The chunk strings to embed and index.
+
+    Returns:
+        The FAISS vector store, or ``None`` if any step raised; in that
+        case the error is shown to the user through ``st.error``.
+    """
+    try:
+        embedder = HuggingFaceEmbeddings(
+            model_name="BAAI/bge-small-en",
+            model_kwargs={'device': 'cpu'},
+            encode_kwargs={"normalize_embeddings": True},
+            cache_folder="/tmp/huggingface_cache",
+        )
+        return FAISS.from_texts(texts=text_chunks, embedding=embedder)
+    except Exception as exc:
+        st.error(f"Error creating vector store: {str(exc)}")
+        return None
+def get_conversation_chain(vectorstore, api_key):
+    """Build the retrieval-augmented question-answering chain.
+
+    Args:
+        vectorstore: Vector store exposing ``as_retriever`` (the index built
+            from the uploaded PDFs).
+        api_key: Groq API key used to authenticate the chat model.
+
+    Returns:
+        A runnable that is invoked with a question string and returns the
+        model's answer as a string, or ``None`` (after reporting through
+        ``st.error``) when the key is missing or setup fails.
+    """
+    if not api_key:
+        st.error("Please provide a valid Groq API key.")
+        return None
+
+    def _format_docs(docs):
+        # Give the model the chunk text only, rather than the repr() of the
+        # list of Document objects that the retriever returns.
+        return "\n\n".join(doc.page_content for doc in docs)
+
+    try:
+        # The key is passed to the client explicitly. It is deliberately not
+        # written to os.environ: the environment is shared by the whole
+        # process, so on a multi-user deployment one visitor's key would
+        # become visible to code serving other sessions.
+        llm = ChatGroq(
+            model="llama3-8b-8192",
+            temperature=0,
+            api_key=api_key
+        )
+        # Create the prompt template
+        prompt = ChatPromptTemplate.from_messages([
+            ("system", """You are a helpful assistant answering questions based on the provided documents.
+            Answer the question using only the context provided.
+            If you don't know the answer, just say that you don't know, don't try to make up an answer.
+            Keep your answers focused and relevant to the question."""),
+            ("human", """Context: {context}
+Question: {question}
+Answer: """)
+        ])
+        # Retrieve the 4 most similar chunks for each question
+        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
+        # question -> {formatted context, question} -> prompt -> LLM -> str
+        chain = (
+            {"context": retriever | _format_docs, "question": RunnablePassthrough()}
+            | prompt
+            | llm
+            | StrOutputParser()
+        )
+        return chain
+    except Exception as e:
+        st.error(f"Failed to initialize Groq model: {str(e)}")
+        st.info("Please check if your API key is valid. Get your API key from: https://console.groq.com/keys")
+        return None
+def handle_user_input(user_question):
+    """Answer ``user_question`` with the stored chain and render the chat.
+
+    The question and the answer are appended to
+    ``st.session_state.chat_history`` and the whole history is then drawn
+    with the user/bot HTML templates. If no chain has been built yet, a
+    warning is shown and nothing else happens. Any exception raised while
+    answering or rendering is reported through ``st.error``.
+
+    Args:
+        user_question: The question text typed by the user.
+    """
+    # Local import so this block brings its own dependency into scope.
+    import html
+    if st.session_state.conversation is None:
+        st.warning("Please upload and process documents first.")
+        return
+    try:
+        # Invoke the chain with the question
+        response = st.session_state.conversation.invoke(user_question)
+        # Update chat history
+        if 'chat_history' not in st.session_state:
+            st.session_state.chat_history = []
+        # Store the raw text; escaping is applied only when rendering
+        st.session_state.chat_history.append(("user", user_question))
+        st.session_state.chat_history.append(("bot", response))
+        # Display chat history
+        for sender, message in st.session_state.chat_history:
+            template = user_template if sender == "user" else bot_template
+            # The templates are rendered as raw HTML, and both the question
+            # and the model output (derived from uploaded PDFs) are untrusted
+            # text, so escape them to prevent markup/script injection.
+            safe_message = html.escape(str(message))
+            st.write(template.replace("{{MSG}}", safe_message), unsafe_allow_html=True)
+    except Exception as e:
+        st.error(f"An error occurred while processing your question: {str(e)}")
+        st.info("This might be due to an invalid API key or network issues.")
+def main():
+    """Render the Streamlit page for chatting with uploaded PDFs.
+
+    Sets up the HuggingFace cache location, initialises session state,
+    collects the Groq API key and the PDF files in the sidebar, builds the
+    vector store and conversation chain when 'Process Documents' is clicked,
+    and shows the question box once a chain is available.
+    """
+    load_dotenv()
+    # Set environment variables for HuggingFace cache
+    os.environ['HUGGINGFACE_HUB_CACHE'] = '/tmp/huggingface_cache'
+    os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface_cache'
+    # Create cache directory
+    os.makedirs('/tmp/huggingface_cache', exist_ok=True)
+    # Fallback chat templates, defined only when the names are not already
+    # module globals.
+    # NOTE(review): both names are imported from htmlTemplates at the top of
+    # the file, so these two branches look unreachable - confirm before
+    # relying on or removing them.
+    if 'user_template' not in globals():
+        global user_template
+        user_template = '''
+        <div class="chat-message user">
+            <div class="avatar">
+                <img src="https://i.ibb.co/rdZC7LZ/user.png">
+            </div>
+            <div class="message">{{MSG}}</div>
+        </div>
+        '''
+    if 'bot_template' not in globals():
+        global bot_template
+        bot_template = '''
+        <div class="chat-message bot">
+            <div class="avatar">
+                <img src="https://i.ibb.co/cN0nmSj/robot.png">
+            </div>
+            <div class="message">{{MSG}}</div>
+        </div>
+        '''
+    st.set_page_config(page_title='Chat with PDFs', page_icon=":books:")
+    # Inject the chat CSS imported from htmlTemplates
+    st.write(css, unsafe_allow_html=True)
+    # Initialize session state
+    if "conversation" not in st.session_state:
+        st.session_state.conversation = None
+    if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
+    if "groq_api_key" not in st.session_state:
+        st.session_state.groq_api_key = ""
+    st.header('PDF ChatBot 📚')
+    # API Key Input Section
+    st.sidebar.header("🔑 API Configuration")
+    # API Key input (pre-filled with the key remembered in session state)
+    groq_api_key = st.sidebar.text_input(
+        "Enter your Groq API Key:",
+        type="password",
+        value=st.session_state.groq_api_key,
+        help="Get your free API key from https://console.groq.com/keys"
+    )
+    # Update session state; an empty input leaves the stored key unchanged
+    if groq_api_key:
+        st.session_state.groq_api_key = groq_api_key
+        st.sidebar.success("✅ API Key provided!")
+    else:
+        st.sidebar.warning("⚠️ Please enter your Groq API key to continue.")
+        st.sidebar.info("Get your free API key from: https://console.groq.com/keys")
+    st.sidebar.markdown("---")
+    # Sidebar for PDF upload
+    st.sidebar.subheader("📄 Upload Documents")
+    pdf_docs = st.sidebar.file_uploader(
+        "Upload your PDFs here and click 'Process'",
+        accept_multiple_files=True,
+        type=['pdf']
+    )
+    # Process button
+    # NOTE(review): every `return` inside this branch leaves main(), so the
+    # chat section and the instructions below are not rendered on that run -
+    # confirm this is intended.
+    if st.sidebar.button('🚀 Process Documents'):
+        if not groq_api_key:
+            st.sidebar.error("❌ Please enter your Groq API key first!")
+            st.error("Please provide your Groq API key in the sidebar to continue.")
+            return
+        if not pdf_docs:
+            st.sidebar.warning("📋 Please upload at least one PDF document.")
+            return
+        with st.spinner("Processing documents... This may take a few minutes for the first run."):
+            try:
+                # Get PDF text
+                raw_text = get_pdf_text(pdf_docs)
+                # Whitespace-only output is treated as "nothing extracted"
+                if not raw_text.strip():
+                    st.error("❌ No text could be extracted from the PDFs. Please check if the PDFs contain readable text.")
+                    return
+                st.info(f"✅ Extracted {len(raw_text)} characters from {len(pdf_docs)} PDF(s)")
+                # Get text chunks
+                text_chunks = get_text_chunks(raw_text)
+                st.info(f"✅ Created {len(text_chunks)} text chunks")
+                # Create vector store
+                with st.spinner("Creating embeddings..."):
+                    vectorstore = get_vector_store(text_chunks)
+                # get_vector_store reports its own error and returns None on failure
+                if vectorstore is None:
+                    st.error("❌ Failed to create vector store. Please try again.")
+                    return
+                st.info("✅ Vector store created successfully")
+                # Create conversation chain
+                with st.spinner("Initializing conversation chain..."):
+                    conversation = get_conversation_chain(vectorstore, groq_api_key)
+                # get_conversation_chain likewise returns None on failure
+                if conversation is None:
+                    st.error("❌ Failed to create conversation chain. Please check your API key.")
+                    return
+                # Keep the chain in session state; the chat section below and
+                # handle_user_input read it from there
+                st.session_state.conversation = conversation
+                st.success("🎉 Documents processed successfully! You can now ask questions.")
+            except Exception as e:
+                st.error(f"❌ An error occurred: {str(e)}")
+                st.info("Please check your API key and try again.")
+    # Main chat interface
+    st.subheader("💬 Ask Questions About Your Documents")
+    # Show exactly one of: key prompt, upload prompt, or the question box
+    if not groq_api_key:
+        st.info("👆 Please enter your Groq API key in the sidebar to get started.")
+        st.info("🔗 Get your free API key from: https://console.groq.com/keys")
+    elif st.session_state.conversation is None:
+        st.info("📤 Upload and process your PDF documents using the sidebar to start chatting.")
+    else:
+        user_question = st.text_input(
+            "Your question:",
+            placeholder="Ask anything about your uploaded documents..."
+        )
+        # NOTE(review): this runs whenever the box is non-empty, so any rerun
+        # with the same text presumably re-asks the question and appends it
+        # to the history again - verify against Streamlit's rerun behavior.
+        if user_question:
+            handle_user_input(user_question)
+    # Display instructions until both a key and a processed chain exist
+    if not groq_api_key or st.session_state.conversation is None:
+        st.markdown("---")
+        st.markdown("### 📋 How to Use:")
+        st.markdown("""
+        1. **Get API Key**: Visit [Groq Console](https://console.groq.com/keys) to get your free API key
+        2. **Enter API Key**: Paste your API key in the sidebar
+        3. **Upload PDFs**: Upload one or more PDF documents
+        4. **Process**: Click 'Process Documents' to analyze your PDFs
+        5. **Chat**: Ask questions about your documents!
+        """)
+# Run the app only when this file is executed as the entry script
+if __name__ == "__main__":
+    main()