# Hugging Face Spaces page residue ("Spaces: Sleeping") — not part of the program.
import os
import io
import re
import requests
import faiss
import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq

# ============ CONFIG ============ #
# The Groq API key must come from the environment; abort the app if it is missing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("β GROQ_API_KEY environment variable not found.")
    st.stop()

client = Groq(api_key=GROQ_API_KEY)
# Sentence embedder used for both document chunks and user queries.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Google Drive file links (shared by you)
GDRIVE_LINKS = [
    "https://drive.google.com/file/d/1aBFrAktgTIFwYxNDiY75Gj-4gwqoUJbm/view?usp=sharing",
    "https://drive.google.com/file/d/1boqYWdtFqYagnVk7oeh6hRZb5Um2W9zC/view?usp=sharing",
]
# ============ UTILS ============ #
def gdrive_to_direct(link):
    """Convert a shared Google Drive link into a direct-download URL.

    Supports both the ``.../file/d/<id>/...`` share format and the older
    ``...?id=<id>`` style (``open?id=``, ``uc?id=``).

    Args:
        link: A Google Drive share URL.

    Returns:
        The ``uc?export=download`` direct URL, or ``None`` when no file id
        can be extracted.
    """
    # Exclude '?' and '#' so a link without a trailing slash still yields
    # just the id, not the query string.
    match = re.search(r"drive\.google\.com/file/d/([^/?#]+)", link)
    if not match:
        # Fall back to "open?id=<id>" / "uc?id=<id>" style links.
        match = re.search(r"drive\.google\.com/[^?#]*\?(?:.*&)?id=([^&#]+)", link)
    if match:
        return f"https://drive.google.com/uc?export=download&id={match.group(1)}"
    return None
def fetch_pdf(url, timeout=30):
    """Download a PDF and return its raw bytes.

    Args:
        url: Direct-download URL.
        timeout: Seconds to wait for the server (default 30, matching the
            previously hard-coded value).

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content
def read_pdf_bytes(data):
    """Extract all text from an in-memory PDF.

    Pages whose extraction yields ``None`` or "" contribute nothing; the
    remaining page texts are concatenated with no separator, matching the
    original behavior.

    Args:
        data: Raw PDF file contents as ``bytes``.

    Returns:
        The extracted text as a single string ("" when nothing extracts).
    """
    reader = PdfReader(io.BytesIO(data))
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, max_length=500):
    """Split *text* into whitespace-delimited chunks of at most
    *max_length* words each.

    Returns an empty list for empty/whitespace-only input.
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + max_length]))
        start += max_length
    return chunks
def create_faiss_index(chunks):
    """Embed the text chunks and build an exact (flat L2) FAISS index.

    Returns:
        A ``(index, chunks)`` pair so callers keep the chunk texts aligned
        with the index rows.
    """
    vectors = np.array(embedder.encode(chunks))
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks
def search_index(index, query, chunks, top_k=3):
    """Return the *top_k* chunks most similar to *query*.

    Args:
        index: FAISS index built over ``chunks``.
        query: Natural-language question.
        chunks: Chunk texts aligned with the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        The nearest chunks, best match first. May be shorter than
        ``top_k`` when the index holds fewer vectors.
    """
    query_embedding = embedder.encode([query])
    _, indices = index.search(np.array(query_embedding), top_k)
    # FAISS pads the result with -1 when top_k exceeds the index size;
    # chunks[-1] would silently return the last chunk, so drop those slots.
    return [chunks[i] for i in indices[0] if i >= 0]
# ============ STREAMLIT UI ============ #
st.set_page_config(page_title="π§ RAG Chat from Cloud PDFs", layout="wide")
st.title("π Chat with 2 Google Drive PDFs (Auto-loaded)")

# Download both Drive PDFs at startup, extract their text, and build the
# vector index before accepting any questions.
with st.spinner("π₯ Downloading and processing PDF documents..."):
    combined_text = ""
    for link in GDRIVE_LINKS:
        direct_url = gdrive_to_direct(link)
        # Guard clause: a link we cannot parse is fatal for the app.
        if not direct_url:
            st.error(f"β Invalid Google Drive link format: {link}")
            st.stop()
        try:
            pdf_bytes = fetch_pdf(direct_url)
            combined_text += read_pdf_bytes(pdf_bytes)
        except Exception as e:
            st.error(f"β Error fetching PDF from: {link}\n\n{e}")
            st.stop()
    chunks = chunk_text(combined_text)
    index, stored_chunks = create_faiss_index(chunks)

st.success("β PDFs loaded and indexed. Ask your questions below!")
# Input box for queries
query = st.text_input("Ask a question based on the documents:")

if query:
    with st.spinner("π Searching and generating response..."):
        # Retrieve the most relevant chunks and prepend them as context
        # for the LLM, followed by the user's question.
        retrieved = search_index(index, query, stored_chunks)
        prompt = "\n".join(retrieved) + f"\n\nQuestion: {query}"
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
        )
        answer = completion.choices[0].message.content.strip()
        st.markdown(f"**Answer:** {answer}")