swisscondor committed on
Commit
7fdfc68
·
verified ·
1 Parent(s): 79c02f2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import chromadb
3
+ import torch
4
+ from transformers import pipeline
5
+ from PyPDF2 import PdfReader
6
+ import os
7
+
8
+ # Initialize Hugging Face pipeline for question answering
9
@st.cache_resource
def load_qa_pipeline():
    """Return the extractive question-answering pipeline.

    Builds a Hugging Face ``question-answering`` pipeline using the
    ``deepset/roberta-base-squad2`` checkpoint.

    The function is wrapped in ``st.cache_resource`` so the pipeline object
    is created once and reused, instead of being rebuilt every time the
    Streamlit script reruns (which happens on each widget interaction).

    Returns:
        The pipeline object; it is called with ``question=`` and
        ``context=`` keyword arguments by the rest of this app.
    """
    return pipeline("question-answering", model="deepset/roberta-base-squad2")
11
+
12
+ # Extract text from PDF
13
def extract_pdf_text(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (here, the object returned
            by the Streamlit file uploader).

    Returns:
        The extracted text of each page, in page order, with a newline
        appended after every page. An empty string if the PDF has no pages.
    """
    reader = PdfReader(pdf_file)
    # ``or ''`` guards against a falsy (e.g. None) extraction result so one
    # text-less page cannot abort the whole document with a TypeError.
    # Joining once avoids repeated string reallocation on large documents.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
19
+
20
+ # Split text into chunks
21
def split_text_into_chunks(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when *text* is empty).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a too-large overlap gives a zero or negative stride,
            which either fails inside ``range`` or silently yields no chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    stride = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), stride)
    ]
26
+
27
+ # Create ChromaDB collection
28
def create_chroma_collection(chunks):
    """Store *chunks* in a freshly created ChromaDB collection.

    Args:
        chunks: Sequence of text chunks to index.

    Returns:
        A ``(client, collection, collection_name)`` tuple. The caller is
        responsible for deleting the collection via
        ``client.delete_collection(name=collection_name)`` when done.
    """
    # Local import keeps this block self-contained; uuid is standard library.
    import uuid

    # Persistent client stores the index on disk under ./chroma_db.
    client = chromadb.PersistentClient(path="./chroma_db")

    # A uuid4 suffix makes name collisions practically impossible. This
    # matters because the store is persistent: a collection left over from
    # an earlier failed run with the same name would make create_collection
    # fail.
    collection_name = f"pdf_qa_collection_{uuid.uuid4().hex}"
    collection = client.create_collection(name=collection_name)

    # Add documents in batches rather than one call per chunk; ids keep the
    # same "chunk_<index>" scheme, numbered by position in *chunks*.
    chunks = list(chunks)
    batch_size = 500
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        collection.add(
            ids=[f"chunk_{i}" for i in range(start, start + len(batch))],
            documents=batch,
        )

    return client, collection, collection_name
46
+
47
+ # Retrieve most relevant context
48
def retrieve_context(collection, question, top_k=3):
    """Look up the stored chunks that best match *question*.

    Args:
        collection: Object exposing a ``query(query_texts=..., n_results=...)``
            method whose result is indexable by ``'documents'``.
        question: The query string.
        top_k: Number of results to request.

    Returns:
        The first entry of the result's ``'documents'`` field, i.e. the
        documents returned for the single query text submitted.
    """
    query_kwargs = {"query_texts": [question], "n_results": top_k}
    response = collection.query(**query_kwargs)
    # Only one query text was sent, so only the first result list is relevant.
    documents_per_query = response['documents']
    return documents_per_query[0]
54
+
55
+ # Main Streamlit app
56
def main():
    """Run the Streamlit UI: upload a PDF, ask a question, show the answer.

    Flow when "Get Answer" is pressed: extract the PDF text, split it into
    chunks, index the chunks in a temporary ChromaDB collection, retrieve
    the chunks closest to the question, run the QA pipeline on each, and
    display the highest-scoring answer with its score.

    Any exception raised while processing is reported to the user through
    ``st.error`` rather than propagated.
    """
    st.title("PDF Question Answering App")

    # Input widgets
    uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
    question = st.text_input("Enter your question")

    if not st.button("Get Answer"):
        return

    # Tell the user what is missing instead of silently doing nothing.
    if not (uploaded_file and question):
        st.warning("Please upload a PDF and enter a question.")
        return

    try:
        qa_pipeline = load_qa_pipeline()

        pdf_text = extract_pdf_text(uploaded_file)
        if not pdf_text.strip():
            # Nothing to index or answer from.
            st.warning("No text could be extracted from this PDF.")
            return

        text_chunks = split_text_into_chunks(pdf_text)
        client, collection, collection_name = create_chroma_collection(text_chunks)
        try:
            contexts = retrieve_context(collection, question)
            # Skip blank contexts; they cannot contain an answer.
            answers = [
                qa_pipeline(question=question, context=context)
                for context in contexts
                if context.strip()
            ]
        finally:
            # Always remove the temporary collection, even when retrieval or
            # QA fails, so failed runs do not pile up in the on-disk store.
            client.delete_collection(name=collection_name)

        if not answers:
            # max() below would raise on an empty list.
            st.warning("No answer could be found in this PDF.")
            return

        best_answer = max(answers, key=lambda x: x['score'])
        st.write("Answer:", best_answer['answer'])
        st.write("Confidence Score:", best_answer['score'])

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user.
        st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()