"""Streamlit RAG chat over PDFs that are auto-downloaded from Google Drive.

On start-up the app downloads the PDFs listed in ``GDRIVE_LINKS``, extracts
their text, splits it into word chunks, embeds the chunks with a
SentenceTransformer model and stores them in a FAISS L2 index.  Each user
question is embedded, the closest chunks are retrieved and sent together
with the question to a Groq-hosted chat model.
"""

import io
import os
import re

import faiss
import numpy as np
import requests
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Configure the page before any other Streamlit call renders output
# (the cached loaders below may otherwise emit elements first).
st.set_page_config(page_title="🧠 RAG Chat from Cloud PDFs", layout="wide")

# ============ CONFIG ============ #
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("❌ GROQ_API_KEY environment variable not found.")
    st.stop()

client = Groq(api_key=GROQ_API_KEY)


@st.cache_resource(show_spinner=False)
def _load_embedder():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


embedder = _load_embedder()

# Google Drive file links (publicly shared PDFs)
GDRIVE_LINKS = [
    "https://drive.google.com/file/d/1aBFrAktgTIFwYxNDiY75Gj-4gwqoUJbm/view?usp=sharing",
    "https://drive.google.com/file/d/1boqYWdtFqYagnVk7oeh6hRZb5Um2W9zC/view?usp=sharing",
]


# ============ UTILS ============ #
def gdrive_to_direct(link: str):
    """Convert a Google Drive ``/file/d/<id>`` share link to a download URL.

    Returns:
        The ``uc?export=download`` URL for the file id found in *link*, or
        ``None`` when *link* does not contain a ``/file/d/<id>`` segment.
    """
    # Stop the id at "/", "?" or "#" so links without a trailing "/view"
    # segment do not drag their query string into the file id.
    match = re.search(r"drive\.google\.com/file/d/([^/?#]+)", link)
    if match is None:
        return None
    return f"https://drive.google.com/uc?export=download&id={match.group(1)}"


def fetch_pdf(url: str) -> bytes:
    """Download *url* and return the raw response body.

    Raises:
        requests.RequestException: on connection problems, a timeout
            (30 seconds) or a non-success HTTP status.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.content


def read_pdf_bytes(data: bytes) -> str:
    """Extract the text of every page of the PDF held in *data*.

    Pages that yield no text are skipped.  Page texts are joined with a
    newline so the last word of one page cannot fuse with the first word
    of the next one.
    """
    reader = PdfReader(io.BytesIO(data))
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)


def chunk_text(text: str, max_length: int = 500) -> list:
    """Split *text* into chunks of at most *max_length* whitespace-separated words.

    Returns:
        A list of strings; empty when *text* contains no words.

    Raises:
        ValueError: if *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")
    words = text.split()
    return [
        " ".join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]


def create_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Returns:
        A ``(index, chunks)`` tuple; row ``i`` of the index corresponds to
        ``chunks[i]``.

    Raises:
        ValueError: if *chunks* is empty (there is nothing to index and the
            embedding dimension cannot be determined).
    """
    if not chunks:
        raise ValueError("Cannot build an index from zero text chunks.")
    # FAISS expects float32 input.
    embeddings = np.asarray(embedder.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks


def search_index(index, query, chunks, top_k=3):
    """Return up to *top_k* chunks whose embeddings are closest to *query*.

    The result is ordered as returned by ``index.search`` and may contain
    fewer than *top_k* items when the index holds fewer vectors.
    """
    # Never ask for more neighbours than the index contains.
    k = min(top_k, index.ntotal)
    if k < 1:
        return []
    query_embedding = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, ids = index.search(query_embedding, k)
    # FAISS reports missing neighbours as -1; used as a Python index that
    # would silently select the *last* chunk, so drop those entries.
    return [chunks[i] for i in ids[0] if i >= 0]


@st.cache_resource(show_spinner=False)
def _build_corpus_index(links):
    """Download, parse, chunk and index the PDFs behind *links*.

    Cached so that Streamlit reruns (one per user interaction) reuse the
    index instead of downloading and embedding the documents again.

    Args:
        links: tuple of Google Drive share links (must be hashable).

    Returns:
        The ``(index, chunks)`` tuple produced by ``create_faiss_index``.

    Raises:
        ValueError: if a link has an unsupported format or no text could be
            extracted from the documents.
        RuntimeError: if a PDF cannot be downloaded or parsed.
    """
    documents = []
    for link in links:
        direct_url = gdrive_to_direct(link)
        if not direct_url:
            raise ValueError(f"❌ Invalid Google Drive link format: {link}")
        try:
            documents.append(read_pdf_bytes(fetch_pdf(direct_url)))
        except Exception as e:
            raise RuntimeError(f"❌ Error fetching PDF from: {link}\n\n{e}") from e

    # Newline between documents keeps their boundary words separate.
    chunks = chunk_text("\n".join(documents))
    if not chunks:
        raise ValueError("❌ No extractable text found in the PDF documents.")
    return create_faiss_index(chunks)


# ============ STREAMLIT UI ============ #
st.title("📄 Chat with 2 Google Drive PDFs (Auto-loaded)")

with st.spinner("📥 Downloading and processing PDF documents..."):
    try:
        index, stored_chunks = _build_corpus_index(tuple(GDRIVE_LINKS))
    except (ValueError, RuntimeError) as e:
        st.error(str(e))
        st.stop()

st.success("✅ PDFs loaded and indexed. Ask your questions below!")

# Input box for queries
query = st.text_input("Ask a question based on the documents:").strip()

if query:
    with st.spinner("🔍 Searching and generating response..."):
        context = search_index(index, query, stored_chunks)
        prompt = "\n".join(context) + f"\n\nQuestion: {query}"
        try:
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
            )
        except Exception as e:
            # Top-level UI boundary: report the failure instead of a traceback.
            st.error(f"❌ Error generating response: {e}")
            st.stop()
        # Guard against an empty completion before stripping.
        answer = (response.choices[0].message.content or "").strip()
    st.markdown(f"**Answer:** {answer}")