Spaces:

swisscondor
/

QuickQuery

Sleeping

App Files Files Community

swisscondor committed on Dec 15, 2024

Commit

7fdfc68

verified ·

1 Parent(s): 79c02f2

Create app.py

Browse files

Files changed (1) hide show

app.py +102 -0

app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import streamlit as st
+import chromadb
+import torch
+from transformers import pipeline
+from PyPDF2 import PdfReader
+import os
+# Initialize Hugging Face pipeline for question answering
+def load_qa_pipeline():
+    return pipeline("question-answering", model="deepset/roberta-base-squad2")
+# Extract text from PDF
+def extract_pdf_text(pdf_file):
+    reader = PdfReader(pdf_file)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() + "\n"
+    return text
+# Split text into chunks
+def split_text_into_chunks(text, chunk_size=500, overlap=100):
+    chunks = []
+    for i in range(0, len(text), chunk_size - overlap):
+        chunks.append(text[i:i+chunk_size])
+    return chunks
+# Create ChromaDB collection
+def create_chroma_collection(chunks):
+    # Use persistent client to avoid memory issues
+    client = chromadb.PersistentClient(path="./chroma_db")
+    # Create a unique collection name
+    collection_name = f"pdf_qa_collection_{int(torch.rand(1).item() * 10000)}"
+    # Create collection
+    collection = client.create_collection(name=collection_name)
+    # Add chunks to collection
+    for i, chunk in enumerate(chunks):
+        collection.add(
+            ids=[f"chunk_{i}"],
+            documents=[chunk]
+        )
+    return client, collection, collection_name
+# Retrieve most relevant context
+def retrieve_context(collection, question, top_k=3):
+    results = collection.query(
+        query_texts=[question],
+        n_results=top_k
+    )
+    return results['documents'][0]
+# Main Streamlit app
+def main():
+    st.title("PDF Question Answering App")
+    # File uploader
+    uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
+    # Question input
+    question = st.text_input("Enter your question")
+    # Run button
+    if st.button("Get Answer"):
+        if uploaded_file and question:
+            try:
+                # Load QA pipeline
+                qa_pipeline = load_qa_pipeline()
+                # Extract PDF text
+                pdf_text = extract_pdf_text(uploaded_file)
+                # Split text into chunks
+                text_chunks = split_text_into_chunks(pdf_text)
+                # Create ChromaDB collection
+                client, collection, collection_name = create_chroma_collection(text_chunks)
+                # Retrieve context
+                contexts = retrieve_context(collection, question)
+                # Prepare answers
+                answers = []
+                for context in contexts:
+                    result = qa_pipeline(question=question, context=context)
+                    answers.append(result)
+                # Display best answer
+                best_answer = max(answers, key=lambda x: x['score'])
+                st.write("Answer:", best_answer['answer'])
+                st.write("Confidence Score:", best_answer['score'])
+                # Clean up ChromaDB collection
+                client.delete_collection(name=collection_name)
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
+# Script entry point: build the page only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()