Spaces:

ashok2216
/

pdf-chatbot

Sleeping

App Files Community

ashok2216 committed on Nov 20, 2024

Commit

e5d1312

verified ·

1 Parent(s): 52da32c

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -14

app.py CHANGED Viewed

@@ -3,16 +3,15 @@ from chromadb.utils import embedding_functions
 from chromadb.config import Settings
 from transformers import pipeline
 import streamlit as st
-import fitz  # PyMuPDF for PDF parsing
 from PIL import Image
-# Configure ChromaDB with persistent SQLite database
 config = Settings(
     persist_directory="./chromadb_data",
     chroma_db_impl="sqlite",
 )
-# Initialize persistent client with SQLite
 def setup_chromadb():
     client = chromadb.PersistentClient(path="./chromadb_data")
     collection = client.get_or_create_collection(
@@ -23,9 +22,8 @@ def setup_chromadb():
     )
     return client, collection
-# Clear the collection
 def clear_collection(client, collection_name):
-    # Delete the collection and recreate it
     client.delete_collection(name=collection_name)
     return client.get_or_create_collection(
         name=collection_name,
@@ -42,9 +40,9 @@ def extract_text_from_pdf(uploaded_file):
         return text
 def add_pdf_text_to_db(collection, pdf_text):
-    sentences = pdf_text.split("\n")  # Split text into lines for granularity
     for idx, sentence in enumerate(sentences):
-        if sentence.strip():  # Avoid empty lines
             collection.add(
                 ids=[f"pdf_text_{idx}"],
                 documents=[sentence],
@@ -61,27 +59,27 @@ def query_pdf_data(collection, query, retriever_model):
     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
     return answer, results["metadatas"]
-# Streamlit Interface
 def main():
     image = Image.open('LOGO.PNG')
     st.image(
     image, width=250)
-    st.title("PDF Chatbot with Retrieval-Augmented Generation")
     st.write("Upload a PDF, and ask questions about its content!")
-    # Initialize components
     client, collection = setup_chromadb()
-    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM
     # File upload
     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
     if uploaded_file:
         try:
-            # Clear existing data
             collection = clear_collection(client, "pdf_data")
             st.info("Existing data cleared from the database.")
-            # Extract and add new data
             pdf_text = extract_text_from_pdf(uploaded_file)
             st.success("Text extracted successfully!")
             st.text_area("Extracted Text:", pdf_text, height=300)
@@ -108,4 +106,3 @@ def main():
 if __name__ == "__main__":
     main()

 from chromadb.config import Settings
 from transformers import pipeline
 import streamlit as st
+import fitz
 from PIL import Image
 config = Settings(
     persist_directory="./chromadb_data",
     chroma_db_impl="sqlite",
 )
 def setup_chromadb():
     client = chromadb.PersistentClient(path="./chromadb_data")
     collection = client.get_or_create_collection(
     )
     return client, collection
 def clear_collection(client, collection_name):
     client.delete_collection(name=collection_name)
     return client.get_or_create_collection(
         name=collection_name,
         return text
 def add_pdf_text_to_db(collection, pdf_text):
+    sentences = pdf_text.split("\n")
     for idx, sentence in enumerate(sentences):
+        if sentence.strip():
             collection.add(
                 ids=[f"pdf_text_{idx}"],
                 documents=[sentence],
     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
     return answer, results["metadatas"]
 def main():
     image = Image.open('LOGO.PNG')
     st.image(
     image, width=250)
+    st.title("PDF Chatbot with RAG")
+    st.markdown("Google Flan-T5-Small + ChromaDB")
+    st.header('', divider='rainbow')
     st.write("Upload a PDF, and ask questions about its content!")
     client, collection = setup_chromadb()
+    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")
     # File upload
     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
     if uploaded_file:
         try:
             collection = clear_collection(client, "pdf_data")
             st.info("Existing data cleared from the database.")
             pdf_text = extract_text_from_pdf(uploaded_file)
             st.success("Text extracted successfully!")
             st.text_area("Extracted Text:", pdf_text, height=300)
 if __name__ == "__main__":
     main()