# Streamlit app: question-answering over a pre-configured Google Drive PDF
# (download -> extract text -> chunk -> embed -> FAISS similarity search).
import os
import re

import faiss
import numpy as np
import pdfplumber
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
| # Constants | |
| DOCUMENT_URL = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing" | |
| CHUNK_SIZE = 500 | |
| # Function to download document | |
| def download_document(file_url): | |
| file_id = file_url.split("/d/")[1].split("/")[0] | |
| download_url = f"https://drive.google.com/uc?export=download&id={file_id}" | |
| response = requests.get(download_url) | |
| output = "document.pdf" | |
| with open(output, "wb") as f: | |
| f.write(response.content) | |
| return output | |
| # Extract text from PDF | |
| def extract_text_from_pdf(file_path): | |
| text = "" | |
| with pdfplumber.open(file_path) as pdf: | |
| for page in pdf.pages: | |
| text += page.extract_text() | |
| return text | |
| # Chunk text into smaller parts | |
| def chunk_text(text, chunk_size=CHUNK_SIZE): | |
| sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text) | |
| chunks, current_chunk = [], "" | |
| for sentence in sentences: | |
| if len(current_chunk) + len(sentence) < chunk_size: | |
| current_chunk += sentence + " " | |
| else: | |
| chunks.append(current_chunk.strip()) | |
| current_chunk = sentence + " " | |
| if current_chunk: | |
| chunks.append(current_chunk.strip()) | |
| return chunks | |
| # Vectorize and store in FAISS | |
| def create_faiss_index(chunks, model): | |
| embeddings = model.encode(chunks) | |
| dimension = embeddings.shape[1] | |
| index = faiss.IndexFlatL2(dimension) | |
| index.add(embeddings) | |
| return index, embeddings | |
| # Query FAISS index | |
| def query_faiss(query, index, chunks, model, k=5): | |
| query_embedding = model.encode([query]) | |
| distances, indices = index.search(query_embedding, k) | |
| return [chunks[i] for i in indices[0]] | |
| # Streamlit application | |
| def main(): | |
| st.title("Document-Based Query Application") | |
| st.write("This application uses a pre-configured document as the dataset for answering queries.") | |
| # Download and process the document | |
| st.write("Processing the pre-configured document...") | |
| document_path = download_document(DOCUMENT_URL) | |
| text = extract_text_from_pdf(document_path) | |
| chunks = chunk_text(text) | |
| # Create FAISS index | |
| st.write("Creating FAISS index...") | |
| embedding_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| index, embeddings = create_faiss_index(chunks, embedding_model) | |
| st.success("Document processed and indexed!") | |
| # Query the database | |
| query = st.text_input("Enter your query") | |
| if query: | |
| st.write("Fetching relevant content from the document...") | |
| results = query_faiss(query, index, chunks, embedding_model) | |
| st.write("Top relevant chunks:") | |
| for i, result in enumerate(results): | |
| st.write(f"{i+1}. {result}") | |
| if __name__ == "__main__": | |
| main() | |