import os
from io import BytesIO
from typing import List, Optional, Tuple
from urllib.parse import parse_qs, urlparse

import faiss
import numpy as np
import pdfplumber
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer

# Seconds to wait on Google Drive before giving up on a download.
DOWNLOAD_TIMEOUT_SECONDS = 30


@st.cache_resource(show_spinner=False)
def _load_embed_model() -> SentenceTransformer:
    """Return the sentence-embedding model, instantiating it only once.

    Streamlit re-executes the script on each user interaction; caching the
    loader keeps the model from being rebuilt on every rerun.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Initialize the embedding model
embed_model = _load_embed_model()

# Initialize Groq API
# The key is read from the GROQ_API_KEY environment variable instead of being
# embedded in the source. The key that was previously committed in this file
# should be treated as leaked and revoked.
API_KEY = os.environ.get("GROQ_API_KEY")
client = Groq(api_key=API_KEY)

# Predefined Google Drive links
STORED_LINKS = [
    "https://drive.google.com/file/d/1zHtEpoEZv_3BhEDhQKkf1D1vya2jzyAd/view?usp=sharing",
    "https://drive.google.com/file/d/1xnRgDFGGV723Bgddf8KE9quwzpllgxyD/view?usp=sharing",
]


# Helper function to extract file ID from Google Drive URL
def extract_drive_file_id(url: str) -> Optional[str]:
    """Return the Drive file ID found in *url*, or ``None`` if there is none.

    The ``id`` query parameter is preferred; otherwise the fourth
    ``/``-separated piece of the path is used, which is where the ID sits in
    ``/file/d/<id>/view`` share links.

    Args:
        url: The link to inspect.

    Returns:
        The file ID, or ``None`` when the host is not a Google Drive host or
        no ID can be located.
    """
    parsed_url = urlparse(url)
    if 'drive.google.com' not in parsed_url.netloc:
        return None

    query_id = parse_qs(parsed_url.query).get('id', [None])[0]
    if query_id:
        return query_id

    # '/file/d/<id>/view' splits into ['', 'file', 'd', '<id>', 'view'].
    # Bounds-check so a short path yields None instead of an IndexError.
    segments = parsed_url.path.split('/')
    if len(segments) > 3 and segments[3]:
        return segments[3]
    return None


# Helper function to download PDF from Google Drive
def download_pdf_from_drive(file_id: str) -> BytesIO:
    """Download the Drive file *file_id* and return its bytes as a buffer.

    Args:
        file_id: Google Drive file identifier.

    Returns:
        An in-memory binary buffer holding the downloaded PDF.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the response body does not look like a PDF.
    """
    response = requests.get(
        "https://drive.google.com/uc",
        # Let requests build and URL-encode the query string.
        params={"id": file_id, "export": "download"},
        # Without a timeout a stalled connection would block the app forever.
        timeout=DOWNLOAD_TIMEOUT_SECONDS,
    )
    response.raise_for_status()

    content = response.content
    # A 200 response can still carry something other than the file (for
    # example an HTML page); fail here with a clear message rather than
    # letting the PDF parser choke on it later.
    if b"%PDF" not in content[:1024]:
        raise ValueError(
            f"Response for file ID {file_id!r} is not a PDF; "
            "check that the file is a PDF and is shared publicly."
        )
    return BytesIO(content)


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of *pdf_file*, joined by single spaces.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (here, a ``BytesIO``).

    Returns:
        The concatenated page texts; pages that yield no text are skipped.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page exactly once; the generator is consumed while
        # the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return ' '.join(page_text for page_text in page_texts if page_text)


# Function to create embeddings and store them in FAISS
def create_embeddings(text: str, chunk_size: int = 500) -> Tuple[List[str], np.ndarray, faiss.Index]:
    """Split *text* into fixed-size chunks, embed them and index them in FAISS.

    Args:
        text: The full document text to index.
        chunk_size: Number of characters per chunk (the last chunk may be
            shorter).

    Returns:
        ``(chunks, embeddings, index)`` where ``chunks[i]`` is the text whose
        vector is row ``i`` of ``embeddings`` and entry ``i`` of ``index``.

    Raises:
        ValueError: If *text* is empty or *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        raise ValueError("Cannot create embeddings from empty text")

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
    # FAISS works on float32 matrices; make the dtype explicit.
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, embeddings, index


# Function to find the most relevant chunk for the user's question
def get_relevant_chunk(question: str, embeddings, index, chunks: List[str]) -> str:
    """Return the chunk whose embedding is nearest to *question*.

    Args:
        question: The user's question.
        embeddings: Unused; kept so existing callers keep working.
        index: FAISS index whose entries line up with *chunks*.
        chunks: The chunk texts that were indexed.

    Returns:
        The single best-matching chunk.

    Raises:
        IndexError: If the index has no match to offer (e.g. it is empty).
    """
    question_embedding = np.asarray(embed_model.encode([question]), dtype=np.float32)
    _distances, neighbours = index.search(question_embedding, 1)  # Retrieve top 1 chunk
    best = int(neighbours[0][0])
    # FAISS reports -1 when it has no neighbour; indexing with that would
    # silently return the *last* chunk.
    if best < 0:
        raise IndexError("FAISS index returned no match; is the index empty?")
    return chunks[best]
def get_answer_from_groq(question: str, context: str) -> str:
    """Ask the Groq chat model to answer *question* using *context*.

    Args:
        question: The user's question.
        context: Text the answer should be based on.

    Returns:
        The content of the first completion choice.
    """
    # NOTE(review): confirm that this model ID is still served by Groq.
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Answer the following question based on the context:\nContext: {context}\nQuestion: {question}",
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content


# Session-state key under which the built knowledge base is kept between reruns.
_KNOWLEDGE_BASE_KEY = "knowledge_base"


def _load_knowledge_base():
    """Return ``(chunks, embeddings, index)`` for STORED_LINKS, or ``None``.

    Streamlit re-executes the script on every interaction, so the result is
    kept in ``st.session_state`` and reused instead of downloading and
    embedding the documents again for each question. A build in which some
    link failed is not stored, so the failed link is retried on the next run.
    """
    cached = st.session_state.get(_KNOWLEDGE_BASE_KEY)
    if cached is not None:
        return cached

    # Process predefined links
    texts = []
    failures = 0
    for link in STORED_LINKS:
        try:
            file_id = extract_drive_file_id(link)
            if not file_id:
                st.warning(f"⚠️ Invalid link: {link}")
                failures += 1
                continue
            st.write(f"📥 Processing document: {link}")
            pdf_file = download_pdf_from_drive(file_id)
            texts.append(extract_text_from_pdf(pdf_file))
        except Exception as e:
            # UI boundary: report the problem and carry on with other links.
            st.error(f"❌ Failed to process link: {link}. Error: {e}")
            failures += 1

    # Separate documents with a space so the last word of one document does
    # not fuse with the first word of the next.
    all_text = " ".join(text for text in texts if text)
    if not all_text:
        return None

    if failures:
        st.warning(
            f"⚠️ {failures} of {len(STORED_LINKS)} documents could not be processed; "
            "continuing with the rest."
        )
    else:
        st.success("✅ All documents processed successfully!")

    # Create embeddings
    st.write("🔄 Creating embeddings...")
    knowledge_base = create_embeddings(all_text)
    st.success("✅ Embeddings created and stored in FAISS index!")

    if not failures:
        st.session_state[_KNOWLEDGE_BASE_KEY] = knowledge_base
    return knowledge_base


# Streamlit app
def main() -> None:
    """Render the page, build the document index and answer one question."""
    st.set_page_config(page_title="Google Drive RAG App", page_icon="📄", layout="centered")
    # NOTE(review): unsafe_allow_html is set but the title carries no markup;
    # the original HTML tags may have been lost - confirm the intended markup.
    st.markdown("\n\nGoogle Drive RAG Application\n\n", unsafe_allow_html=True)
    st.write("Processing predefined document links from Google Drive to generate embeddings stored in a FAISS index.")

    knowledge_base = _load_knowledge_base()
    if knowledge_base is None:
        # Nothing could be extracted, so there is nothing to ask about.
        return
    chunks, embeddings, index = knowledge_base

    # Question section
    question = st.text_input("Ask a question based on the uploaded documents:")
    # Skip blank input so whitespace is never sent to the API.
    if not question or not question.strip():
        return

    try:
        relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
        st.write("🔄 Retrieving the answer...")
        answer = get_answer_from_groq(question, relevant_chunk)
    except Exception as e:
        # UI boundary: show API/network failures instead of a raw traceback.
        st.error(f"❌ Failed to retrieve the answer. Error: {e}")
    else:
        st.subheader("Answer:")
        st.write(answer)


if __name__ == "__main__":
    main()