Spaces:

engrrifatullah
/

RAG_Based_Application_with_Google_Drive_Link

Sleeping

App Files Files Community

RAG_Based_Application_with_Google_Drive_Link / app.py

engrrifatullah

Update app.py

1ff4cc0 verified over 1 year ago

raw

history blame contribute delete

4.16 kB

	import os
	from io import BytesIO
	from urllib.parse import urlparse, parse_qs

	import streamlit as st
	import numpy as np
	import faiss
	import requests
	import pdfplumber
	from sentence_transformers import SentenceTransformer
	from groq import Groq

	# Sentence-embedding model shared by indexing and querying, so that document
	# chunks and questions are encoded into the same vector space.
	embed_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Groq API client.
	# The key is read from the GROQ_API_KEY environment variable instead of being
	# committed to source control.
	# NOTE: a key was previously hard-coded on this line; treat that key as
	# compromised and rotate it in the Groq console.
	API_KEY = os.environ.get("GROQ_API_KEY")
	client = Groq(api_key=API_KEY)

	# Google Drive share links of the PDF documents that make up the corpus.
	# Each entry is a ".../file/d/<file id>/view" style URL.
	STORED_LINKS = [
	    "https://drive.google.com/file/d/1zHtEpoEZv_3BhEDhQKkf1D1vya2jzyAd/view?usp=sharing",
	    "https://drive.google.com/file/d/1xnRgDFGGV723Bgddf8KE9quwzpllgxyD/view?usp=sharing",
	]

	# Helper function to extract file ID from Google Drive URL
	def extract_drive_file_id(url):
	    """Return the Google Drive file ID contained in *url*, or None.

	    Two URL shapes are recognised:

	    * an ``id`` query parameter, e.g. ``.../uc?id=<file id>``
	    * a ``d/<file id>`` pair of path segments, e.g. ``.../file/d/<file id>/view``

	    Args:
	        url: The link to inspect.

	    Returns:
	        The file ID as a string, or None when the host is not
	        ``drive.google.com`` (or a subdomain of it) or no ID can be found.
	    """
	    parsed_url = urlparse(url)

	    # Compare the hostname exactly rather than by substring, so hosts such as
	    # "drive.google.com.evil.example" are rejected.
	    host = (parsed_url.hostname or "").lower()
	    if host != "drive.google.com" and not host.endswith(".drive.google.com"):
	        return None

	    query_id = parse_qs(parsed_url.query).get('id', [None])[0]
	    if query_id:
	        return query_id

	    # Locate the "d" marker instead of assuming a fixed position: a fixed index
	    # raises IndexError on short paths and picks the wrong segment when the
	    # path carries extra leading segments.
	    segments = [segment for segment in parsed_url.path.split('/') if segment]
	    if 'd' in segments:
	        marker = segments.index('d')
	        if marker + 1 < len(segments):
	            return segments[marker + 1]
	    return None

	# Helper function to download PDF from Google Drive
	def download_pdf_from_drive(file_id, timeout=30):
	    """Download the Drive file *file_id* and return its bytes as a BytesIO.

	    Args:
	        file_id: Google Drive file ID.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` may block indefinitely.

	    Returns:
	        A ``BytesIO`` positioned at the start of the downloaded content.

	    Raises:
	        requests.RequestException: On network errors, timeouts, or a non-2xx
	            response (via ``raise_for_status``).
	        ValueError: If the response body does not look like a PDF.
	    """
	    # Pass the ID through ``params`` so it is URL-encoded instead of being
	    # spliced into the query string verbatim.
	    response = requests.get(
	        "https://drive.google.com/uc",
	        params={"id": file_id, "export": "download"},
	        timeout=timeout,
	    )
	    response.raise_for_status()

	    content = response.content
	    # A 200 response is not necessarily the file itself (it may be, e.g., an
	    # HTML page). Fail here with a clear message instead of letting the PDF
	    # parser fail later with an obscure one. The marker is searched in the
	    # first 1024 bytes because PDF readers tolerate leading bytes before it.
	    if b"%PDF" not in content[:1024]:
	        raise ValueError(
	            f"Google Drive did not return a PDF for file id {file_id!r}; "
	            "check that the file is a PDF and is shared publicly."
	        )
	    return BytesIO(content)

	# Function to extract text from PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of *pdf_file*, joined by single spaces.

	    Args:
	        pdf_file: Anything ``pdfplumber.open`` accepts; the app passes a
	            ``BytesIO`` holding the downloaded PDF.

	    Returns:
	        The concatenated page texts. Pages for which ``extract_text()``
	        returns a falsy value (None or an empty string) are skipped, so the
	        result is an empty string when no page yields text.
	    """
	    with pdfplumber.open(pdf_file) as pdf:
	        # Extract each page exactly once; the extraction call is repeated work
	        # if it is made both in the filter and in the joined expression.
	        page_texts = (page.extract_text() for page in pdf.pages)
	        # The generator is consumed inside the ``with`` block, while the PDF
	        # is still open.
	        return ' '.join(text for text in page_texts if text)

	# Function to create embeddings and store them in FAISS
	def create_embeddings(text, chunk_size=500):
	    """Split *text* into fixed-size chunks, embed them and index them in FAISS.

	    Args:
	        text: The full corpus text.
	        chunk_size: Number of characters per chunk. Chunks do not overlap; the
	            last chunk may be shorter.

	    Returns:
	        A ``(chunks, embeddings, index)`` tuple: the list of chunk strings, the
	        float32 embedding matrix with one row per chunk, and a
	        ``faiss.IndexFlatL2`` containing those rows in the same order.

	    Raises:
	        ValueError: If *chunk_size* is not positive or *text* is empty.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer.")

	    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
	    if not chunks:
	        # Fail with a clear message here rather than with an opaque error
	        # further down, when the embedding dimension is read from an empty
	        # result.
	        raise ValueError("Cannot create embeddings from empty text.")

	    # FAISS expects a C-contiguous float32 matrix; enforce it explicitly
	    # rather than relying on the encoder's default output.
	    embeddings = np.ascontiguousarray(embed_model.encode(chunks), dtype=np.float32)

	    index = faiss.IndexFlatL2(embeddings.shape[1])
	    index.add(embeddings)
	    return chunks, embeddings, index

	# Function to find the most relevant chunk for the user's question
	def get_relevant_chunk(question, embeddings, index, chunks, top_k=1):
	    """Return the chunk(s) whose embeddings are closest to *question*.

	    Args:
	        question: The user's question.
	        embeddings: Unused; kept so existing callers keep working.
	        index: FAISS index built over the embeddings of *chunks*.
	        chunks: Chunk strings, in the order they were added to *index*.
	        top_k: Number of nearest chunks to retrieve (default 1).

	    Returns:
	        The retrieved chunk text. With ``top_k > 1`` the chunks are joined by
	        newlines, closest first. An empty string is returned when the index
	        yields no valid neighbour.

	    Raises:
	        ValueError: If *top_k* is smaller than 1.
	    """
	    if top_k < 1:
	        raise ValueError("top_k must be at least 1.")

	    question_embedding = np.asarray(embed_model.encode([question]), dtype=np.float32)
	    _distances, neighbours = index.search(question_embedding, top_k)

	    # FAISS fills missing neighbours with -1. Filter those out: used directly
	    # as a list index, -1 would silently select the last chunk.
	    hits = [chunks[position] for position in neighbours[0] if 0 <= position < len(chunks)]
	    return "\n".join(hits)

	# Function to get the model's response from Groq API
	def get_answer_from_groq(question, context):
	    """Ask the Groq chat model to answer *question* using *context*.

	    Args:
	        question: The user's question.
	        context: Text the model is told to base its answer on.

	    Returns:
	        The message content of the first choice in the completion.
	    """
	    prompt = f"Answer the following question based on the context:\nContext: {context}\nQuestion: {question}"
	    conversation = [{"role": "user", "content": prompt}]
	    completion = client.chat.completions.create(
	        messages=conversation,
	        model="llama3-8b-8192",
	    )
	    first_choice = completion.choices[0]
	    return first_choice.message.content

	# Streamlit app
	def _load_documents(links):
	    """Download each link and extract its text, reporting progress in the UI.

	    Returns a ``(texts, failures)`` tuple: the extracted text of every
	    document that was processed, and the number of links that were invalid or
	    failed.
	    """
	    texts = []
	    failures = 0
	    for link in links:
	        try:
	            file_id = extract_drive_file_id(link)
	            if not file_id:
	                st.warning(f"⚠️ Invalid link: {link}")
	                failures += 1
	                continue
	            st.write(f"📥 Processing document: {link}")
	            pdf_file = download_pdf_from_drive(file_id)
	            texts.append(extract_text_from_pdf(pdf_file))
	        except Exception as e:
	            # UI boundary: one bad document must not abort the others, so
	            # report the failure and carry on.
	            st.error(f"❌ Failed to process link: {link}. Error: {e}")
	            failures += 1
	    return texts, failures


	def main():
	    """Render the app: build the document index once, then answer questions.

	    Streamlit re-executes this function on every interaction, so the chunks,
	    embeddings and FAISS index are kept in ``st.session_state`` and built only
	    on the first run of a session instead of on every question.
	    """
	    st.set_page_config(page_title="Google Drive RAG App", page_icon="📄", layout="centered")
	    st.markdown("<h1 style='text-align: center;'>Google Drive RAG Application</h1>", unsafe_allow_html=True)

	    st.write("Processing predefined document links from Google Drive to generate embeddings stored in a FAISS index.")

	    if "rag_store" not in st.session_state:
	        # Process predefined links
	        texts, failures = _load_documents(STORED_LINKS)

	        # Join with a newline so the last word of one document and the first
	        # word of the next are not fused together.
	        all_text = "\n".join(text for text in texts if text)
	        if not all_text:
	            st.error("❌ No text could be extracted from the configured documents.")
	            return

	        if failures:
	            # Do not claim full success when some links were skipped.
	            st.warning(f"⚠️ Processed {len(STORED_LINKS) - failures} of {len(STORED_LINKS)} documents.")
	        else:
	            st.success("✅ All documents processed successfully!")

	        # Create embeddings
	        st.write("🔄 Creating embeddings...")
	        st.session_state["rag_store"] = create_embeddings(all_text)
	        st.success("✅ Embeddings created and stored in FAISS index!")

	    chunks, embeddings, index = st.session_state["rag_store"]

	    # Question section
	    question = st.text_input("Ask a question based on the uploaded documents:")
	    if question:
	        st.write("🔄 Retrieving the answer...")
	        try:
	            relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
	            answer = get_answer_from_groq(question, relevant_chunk)
	        except Exception as e:
	            # UI boundary: show the failure instead of a raw traceback.
	            st.error(f"❌ Failed to retrieve an answer. Error: {e}")
	            return
	        st.subheader("Answer:")
	        st.write(answer)


	if __name__ == "__main__":
	    main()