# First_Aid_Kit / app.py
# Streamlit RAG demo: downloads a Google Drive PDF, chunks its text,
# embeds the chunks into a FAISS index, and answers questions about it
# using a Groq-hosted model. (Header reconstructed from HF viewer residue.)
import io
import os

import faiss
import nltk
import PyPDF2
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer
# Ensure the NLTK "punkt" sentence-tokenizer data is present before
# chunk_text() calls nltk.sent_tokenize; download it on first run only.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# Module-level Groq client; the API key is read from the GROQ_API_KEY
# environment variable (None if unset — Groq calls will then fail at use).
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def extract_text_from_pdf(pdf_url):
    """Download a PDF from a Google Drive share link and return its text.

    Parameters
    ----------
    pdf_url : str
        Shareable link of the form
        ``https://drive.google.com/file/d/<ID>/view?usp=sharing``.

    Returns
    -------
    str
        Concatenated text of all pages; pages with no extractable text
        (e.g. scanned images) contribute nothing.

    Raises
    ------
    requests.HTTPError
        If the download does not return a 2xx status.
    """
    # Rewrite .../file/d/<ID>/view?usp=sharing -> .../uc?id=<ID>
    # (the direct-download form of a Drive share link).
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url)
    # Fail loudly instead of feeding an HTML error page to the PDF parser.
    response.raise_for_status()

    # Parse in memory: no temp.pdf on disk to leak if parsing raises,
    # and no race between concurrent Streamlit sessions sharing one file.
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # extract_text() may return None for image-only pages; coalesce to "".
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, chunk_size=300):
    """Split *text* into sentence-aligned chunks of at most ~chunk_size words.

    Sentences are never split; a chunk is closed as soon as adding the
    next sentence would push its word count past ``chunk_size``. A single
    sentence longer than ``chunk_size`` becomes its own chunk.

    Parameters
    ----------
    text : str
        Raw document text.
    chunk_size : int, optional
        Approximate maximum number of whitespace-separated words per chunk.

    Returns
    -------
    list[str]
        Non-empty chunks in document order (empty list for empty input).
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        # Close the running chunk only if it is non-empty; the original
        # appended "" when the very first sentence already exceeded the
        # budget, producing spurious empty chunks.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def create_faiss_index(chunks, model=None):
    """Embed text chunks and build an exact-L2 FAISS index over them.

    Parameters
    ----------
    chunks : list[str]
        Text chunks to embed.
    model : SentenceTransformer, optional
        Embedding model to reuse. When omitted, "all-MiniLM-L6-v2" is
        loaded fresh (backward-compatible with existing call sites,
        which otherwise load the same model a second time for querying).

    Returns
    -------
    tuple
        ``(index, embeddings)`` — a ``faiss.IndexFlatL2`` already
        populated with the embeddings, and the embedding matrix itself.
    """
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
def query_faiss(index, query, chunks, model, k=3):
    """Return the ``k`` chunks most similar to *query*.

    Parameters
    ----------
    index : faiss.Index
        Index built over the embeddings of ``chunks``.
    query : str
        Natural-language question.
    chunks : list[str]
        Texts in the same order as the indexed embeddings.
    model : SentenceTransformer
        The same embedding model used to build the index.
    k : int, optional
        Number of neighbours to retrieve (default 3, as before).

    Returns
    -------
    list[str]
        Up to ``k`` matching chunks, nearest first.
    """
    query_vector = model.encode([query])
    distances, indices = index.search(query_vector, k)
    # FAISS pads the result with -1 when fewer than k vectors are indexed;
    # the original's chunks[i] then silently returned chunks[-1].
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
# Main Streamlit App
def main():
st.title("RAG-based Application")
st.write("Interact with your document using Groq-powered model.")
# Pre-defined document link
doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
# Extract Document Content
if "document_text" not in st.session_state:
st.write("Extracting document content...")
text = extract_text_from_pdf(doc_link)
st.session_state['document_text'] = text
st.success("Document content extracted!")
# Process Document and Create FAISS Index
if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
st.write("Processing document...")
chunks = chunk_text(st.session_state['document_text'])
index, embeddings = create_faiss_index(chunks)
st.session_state['faiss_index'] = index
st.session_state['chunks'] = chunks
st.session_state['model'] = SentenceTransformer("all-MiniLM-L6-v2")
st.success(f"Document processed into {len(chunks)} chunks!")
# Query the Document
if 'faiss_index' in st.session_state:
st.header("Ask Questions")
query = st.text_input("Enter your question here")
if st.button("Query Document"):
results = query_faiss(st.session_state['faiss_index'],