import os

import streamlit as st
import requests
import PyPDF2
from groq import Groq
# Note: on newer LangChain releases these classes live under langchain_community
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the Groq client from the GROQ_API_KEY environment variable
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# Extract text from a PDF hosted on Google Drive
def extract_text_from_pdf(pdf_url):
    # Convert the Google Drive shareable link to a direct-download link
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url)
    response.raise_for_status()

    # Save the download to a temporary file
    with open("temp.pdf", "wb") as f:
        f.write(response.content)

    # Read the PDF content page by page
    with open("temp.pdf", "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without extractable text
            text += page.extract_text() or ""

    os.remove("temp.pdf")
    return text
# Split text into chunks of roughly chunk_size words
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        # Accumulate words until the current chunk reaches chunk_size words
        if len(current_chunk) < chunk_size:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
# Embed the chunks and store them in a FAISS vector store via LangChain
def create_faiss_index(chunks):
    # HuggingFaceEmbeddings loads the sentence-transformers model internally,
    # so a separate SentenceTransformer instance is not needed
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    doc_search = FAISS.from_texts(chunks, embeddings)
    return doc_search
# Query the FAISS index and return the most relevant document chunks
def query_faiss(doc_search, query):
    results = doc_search.similarity_search(query, k=3)
    return [result.page_content for result in results]
# Main Streamlit app
def main():
    st.title("RAG-based Application")
    st.write("Interact with your document using a Groq-powered model.")

    # Pre-defined document link
    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    # Extract the document content once per session
    if "document_text" not in st.session_state:
        st.write("Extracting document content...")
        text = extract_text_from_pdf(doc_link)
        st.session_state["document_text"] = text
        st.success("Document content extracted!")

    # Chunk the document and build the FAISS index once per session
    if "document_text" in st.session_state and "faiss_index" not in st.session_state:
        st.write("Processing document...")
        chunks = chunk_text(st.session_state["document_text"])
        doc_search = create_faiss_index(chunks)
        st.session_state["faiss_index"] = doc_search
        st.session_state["chunks"] = chunks
        st.success(f"Document processed into {len(chunks)} chunks!")
    # Query the document
    if "faiss_index" in st.session_state:
        st.header("Ask Questions")
        query = st.text_input("Enter your question here")
        if st.button("Query Document") and query:
            results = query_faiss(st.session_state["faiss_index"], query)
            if not results:
                st.warning("No relevant context found in the document.")
            else:
                st.write("### Results from Document:")
                for i, result in enumerate(results):
                    st.write(f"**Result {i+1}:** {result}")

                # Combine the retrieved chunks into a single context block
                context = "\n".join(results)
                st.write("### Insights based on Document Context:")
                prompt = (
                    f"The following context is from the document:\n\n"
                    f"{context}\n\n"
                    f"Based on this context, answer the question:\n"
                    f"{query}"
                )
                # Ask the Groq-hosted model to answer using the retrieved context
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="llama-3.3-70b-versatile",
                )
                st.write(chat_completion.choices[0].message.content)


if __name__ == "__main__":
    main()