import os

import streamlit as st
import numpy as np
import faiss
from groq import Groq
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from sentence_transformers import SentenceTransformer

# Constants
DRIVE_FILE_LINK = "https://drive.google.com/file/d/1kYGomSibXW-wCFptEMcWP12jOz1390OK/view?usp=drive_link"
GROQ_MODEL = "llama-3.3-70b-versatile"


# Authentication and setup for Google Drive
@st.cache_resource
def load_drive_content(file_link):
    """Download the PDF behind a Google Drive share link to ./document.pdf.

    Opens a local-webserver OAuth flow on first run (pydrive). Cached by
    Streamlit so the download/auth happens once per session.

    Args:
        file_link: a ".../file/d/<id>/view?..." style Drive share URL.

    Returns:
        The local path of the downloaded file ("document.pdf").
    """
    gauth = GoogleAuth()
    gauth.LocalWebserverAuth()
    drive = GoogleDrive(gauth)
    # Extract the file id from the ".../d/<id>/view..." segment of the link.
    file_id = file_link.split('/d/')[1].split('/view')[0]
    downloaded_file = drive.CreateFile({'id': file_id})
    downloaded_file.GetContentFile("document.pdf")
    return "document.pdf"


# Chunking and embedding creation
@st.cache_resource
def prepare_embeddings(document_path):
    """Read a PDF, chunk its text, embed the chunks, and build a FAISS index.

    Args:
        document_path: path to a local PDF file.

    Returns:
        (chunks, index): the list of text chunks and an L2 FAISS index over
        their sentence-transformer embeddings, aligned by position.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(document_path)
    # extract_text() may return None for image-only/scanned pages; treat
    # those as empty rather than crashing on `str + None`.
    text = "".join(page.extract_text() or "" for page in reader.pages)

    # Sliding-window chunking: 500-char chunks, 200-char overlap
    # (i.e. the window advances 300 chars per chunk).
    chunk_size = 500
    chunk_overlap = 200
    step = chunk_size - chunk_overlap
    # Fall back to a single empty chunk so an empty document still yields a
    # valid (if useless) index instead of an IndexError below.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), step)] or [""]

    # Embedding model
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    # convert_to_numpy avoids the GPU crash that
    # `convert_to_tensor=True ... .detach().numpy()` causes on CUDA tensors.
    embeddings = embedder.encode(chunks, convert_to_numpy=True)
    # FAISS requires float32, C-contiguous input.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Store in FAISS
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, index


# Groq setup
@st.cache_resource
def groq_client():
    """Return a cached Groq API client configured from GROQ_API_KEY."""
    return Groq(api_key=os.environ.get("GROQ_API_KEY"))


# Retrieve and query vector DB
def query_vector_db(query, chunks, index, embedder):
    """Return the chunk most similar to *query*, via nearest-neighbor search.

    Args:
        query: user query string.
        chunks: chunk list aligned with the index's vectors.
        index: FAISS index built by prepare_embeddings.
        embedder: the SentenceTransformer used to embed the query (must match
            the model used to build the index).

    Returns:
        The best-matching chunk, or a fallback message when no match exists.
    """
    # Same GPU-safety fix as in prepare_embeddings: get numpy directly.
    query_embedding = embedder.encode([query], convert_to_numpy=True)
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
    D, I = index.search(query_embedding, k=1)  # Find top result
    if I[0][0] != -1:  # FAISS reports -1 when there is no valid neighbor
        return chunks[I[0][0]]
    return "No relevant content found."
# Streamlit application
@st.cache_resource
def _load_embedder():
    """Load the query-embedding model once per session.

    Cached so Streamlit reruns (every user interaction re-executes the
    script) don't reload the model weights each time.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")


def main():
    """Streamlit entry point: load the document, build the index, answer queries."""
    st.title("RAG-based Application with Groq")

    # Load document and prepare FAISS (both cached by st.cache_resource).
    st.info("Loading document and preparing FAISS...")
    document_path = load_drive_content(DRIVE_FILE_LINK)
    chunks, index = prepare_embeddings(document_path)
    # Cached loader instead of constructing SentenceTransformer inline,
    # which would reload the model on every rerun.
    embedder = _load_embedder()
    client = groq_client()

    # Interface
    user_input = st.text_input("Enter your query:")
    if user_input:
        context = query_vector_db(user_input, chunks, index, embedder)
        st.write("**Relevant Context:**", context)

        # Query Groq model with the retrieved context prepended to the prompt.
        with st.spinner("Querying Groq model..."):
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "user", "content": f"Based on this context: {context}, {user_input}"}
                ],
                model=GROQ_MODEL,
            )
        st.write("**Groq Model Response:**", chat_completion.choices[0].message.content)


if __name__ == "__main__":
    main()