# First_Aid_Kit / app.py
# NHZ's picture
# Update app.py
# af61ce0 verified
# raw
# history blame
# 2.86 kB
import os
import re
import requests
import pdfplumber
import streamlit as st
import faiss
from sentence_transformers import SentenceTransformer
# Constants
# Google Drive "view" share link for the source PDF; download_document()
# parses the file id out of this URL to build a direct-download link.
DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
# Approximate maximum number of characters per text chunk fed to the embedder.
CHUNK_SIZE = 500
# Function to download document
def download_document(file_url, output="document.pdf"):
    """Download a shared Google Drive file to a local path.

    Parameters
    ----------
    file_url : str
        A Google Drive "view" share link containing ``/d/<file_id>/``.
    output : str
        Local filename to write the downloaded bytes to (default
        ``"document.pdf"``, matching the previous hard-coded behavior).

    Returns
    -------
    str
        The path the file was written to (``output``).

    Raises
    ------
    requests.HTTPError
        If Drive responds with an error status; previously an error/HTML
        page would have been silently saved as the "PDF".
    """
    # Extract the opaque file id from the share URL.
    file_id = file_url.split("/d/")[1].split("/")[0]
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    # Timeout so a stalled connection cannot hang the app forever.
    response = requests.get(download_url, timeout=60)
    response.raise_for_status()  # fail loudly instead of writing an error page
    with open(output, "wb") as f:
        f.write(response.content)
    return output
# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    ``page.extract_text()`` returns ``None`` for image-only or empty pages;
    the original ``text += page.extract_text()`` would raise ``TypeError``
    on such pages. Pages that yield no text contribute an empty string.
    """
    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # "or ''" guards against extract_text() returning None.
            pages.append(page.extract_text() or "")
    # join() avoids the quadratic cost of repeated string +=.
    return "".join(pages)
# Chunk text into smaller parts
def chunk_text(text, chunk_size=None):
    """Split *text* into sentence-aligned chunks of roughly *chunk_size* chars.

    Sentences (split on ``.``/``?`` boundaries, with lookbehinds that avoid
    splitting abbreviations like "e.g." or "Mr.") are greedily packed into
    chunks; a sentence longer than *chunk_size* becomes its own chunk.

    Parameters
    ----------
    text : str
        The text to split. Empty input yields ``[]`` (previously ``[""]``).
    chunk_size : int | None
        Target chunk length in characters; ``None`` (the default) uses the
        module-level ``CHUNK_SIZE``, resolved at call time.

    Returns
    -------
    list[str]
        Non-empty, whitespace-stripped chunks.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    chunks, current = [], ""
    for sentence in sentences:
        if not sentence.strip():
            continue  # skip empty fragments (e.g. from empty input)
        if len(current) + len(sentence) < chunk_size:
            current += sentence + " "
        else:
            # Only flush a non-empty accumulator: the original emitted an
            # empty "" chunk whenever the very first sentence overflowed.
            if current:
                chunks.append(current.strip())
            current = sentence + " "
    if current:
        chunks.append(current.strip())
    return chunks
# Vectorize and store in FAISS
def create_faiss_index(chunks, model):
    """Encode *chunks* with *model* and build an exact L2 FAISS index.

    Parameters
    ----------
    chunks : list[str]
        Text chunks to embed.
    model : SentenceTransformer
        Embedding model; ``encode`` is assumed to return a 2-D numpy array
        of shape (len(chunks), dim) — the library's default behavior.

    Returns
    -------
    tuple
        ``(index, embeddings)`` — the populated ``faiss.IndexFlatL2`` and
        the float32 embedding matrix.
    """
    # FAISS requires float32 input; coerce in case the model yields float64.
    embeddings = model.encode(chunks).astype("float32")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, embeddings
# Query FAISS index
def query_faiss(query, index, chunks, model, k=5):
    """Return up to *k* chunks most similar to *query* (nearest L2 distance).

    The original indexed ``chunks[i]`` directly: when ``k`` exceeded the
    number of indexed vectors, FAISS pads ``indices`` with ``-1`` and
    ``chunks[-1]`` silently returned the wrong (last) chunk. We clamp *k*
    to the corpus size and drop any negative padding indices.
    """
    k = min(k, len(chunks))
    if k <= 0:
        return []  # empty corpus (or non-positive k): nothing to retrieve
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, k)
    # Filter -1 entries defensively in case the index holds fewer vectors
    # than len(chunks) suggests.
    return [chunks[i] for i in indices[0] if i >= 0]
# Streamlit application
@st.cache_resource
def _build_search_index():
    """Download, parse, chunk, embed, and index the document — once.

    Streamlit re-executes the entire script on every widget interaction;
    without caching, the PDF was re-downloaded and re-embedded on every
    query. ``st.cache_resource`` memoizes the whole pipeline for the
    lifetime of the server process.

    Returns ``(index, chunks, embedding_model)``.
    """
    st.write("Processing the pre-configured document...")
    document_path = download_document(DOCUMENT_URL)
    text = extract_text_from_pdf(document_path)
    chunks = chunk_text(text)
    st.write("Creating FAISS index...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    index, _embeddings = create_faiss_index(chunks, embedding_model)
    return index, chunks, embedding_model


def main():
    """Streamlit entry point: build (or reuse) the index and serve queries."""
    st.title("Document-Based Query Application")
    st.write("This application uses a pre-configured document as the dataset for answering queries.")
    # Cached: only does real work on the first run of the server process.
    index, chunks, embedding_model = _build_search_index()
    st.success("Document processed and indexed!")
    # Query the database
    query = st.text_input("Enter your query")
    if query:
        st.write("Fetching relevant content from the document...")
        results = query_faiss(query, index, chunks, embedding_model)
        st.write("Top relevant chunks:")
        for i, result in enumerate(results):
            st.write(f"{i+1}. {result}")


if __name__ == "__main__":
    main()