"""Streamlit app that answers questions about an uploaded PDF.

Pipeline: extract the PDF text, split it into word-based chunks, embed the
chunks with a SentenceTransformer model, store the vectors in an in-memory
FAISS index, retrieve the chunks nearest to the user's query, and send them
as context to a Groq chat-completion model.
"""
import os

import faiss
import numpy as np
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

# Maximum number of chunks retrieved from the index for each query.
TOP_K = 5

# Initialize Groq client (API key is read from the "groq_api_key" env var).
client = Groq(api_key=os.getenv("groq_api_key"))


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; caching
    avoids re-instantiating the model each time.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")


# Load embedding model
embedding_model = _load_embedding_model()

# Initialize FAISS vector store.
# The index is intentionally rebuilt on every script run so that it always
# holds exactly the vectors of the `chunks` list built further below.
dimension = 384  # Embedding dimension of the model
index = faiss.IndexFlatL2(dimension)


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, joined with newlines.

    Args:
        pdf_file: A path or binary file-like object accepted by ``PdfReader``.

    Returns:
        The extracted text as a single string. Pages for which extraction
        yields nothing contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # `or ""` guards against pages whose extraction returns a falsy value
    # (e.g. None); the newline separator keeps the last word of one page
    # from fusing with the first word of the next.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def chunk_text(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding up to *chunk_size* words joined by
        single spaces. Empty or whitespace-only text yields an empty list.
    """
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


def add_to_vector_db(chunks):
    """Embed *chunks* and add the vectors to the module-level FAISS index.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        The embeddings produced for *chunks*. For an empty list, an empty
        ``(0, dimension)`` float32 array is returned and the index is left
        untouched.
    """
    if not chunks:
        # Nothing to embed; skip the encoder and the index entirely.
        return np.empty((0, dimension), dtype="float32")
    embeddings = embedding_model.encode(chunks)
    index.add(np.asarray(embeddings, dtype="float32"))
    return embeddings


# Streamlit frontend
st.title("RAG-based PDF Query Application")

# PDF upload
uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])

# Defined unconditionally so the query branch below can test it even when
# no file has been uploaded yet.
chunks = []

if uploaded_file:
    st.write("Processing your PDF...")
    text = extract_text_from_pdf(uploaded_file)
    chunks = chunk_text(text)
    if chunks:
        add_to_vector_db(chunks)
        st.success("PDF processed and embeddings stored in the vector database!")
    else:
        st.warning("No extractable text was found in this PDF.")

# Query input
query = st.text_input("Enter your query:")

if query:
    if not chunks:
        st.warning("Please upload a PDF with extractable text before asking a question.")
    else:
        # Generate embedding for query
        query_embedding = embedding_model.encode([query])

        # Retrieve relevant chunks from FAISS. Never ask for more neighbours
        # than the index holds, and drop any out-of-range ids (FAISS uses -1
        # to mark missing results) so they cannot index `chunks` from the end.
        k = min(TOP_K, index.ntotal)
        _distances, indices = index.search(
            np.asarray(query_embedding, dtype="float32"), k=k
        )
        context = "\n".join(
            chunks[i] for i in indices[0] if 0 <= i < len(chunks)
        )

        # Interact with Groq API
        # NOTE(review): confirm this model id is still served by Groq.
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"Context: {context}\n\nQuery: {query}",
                }
            ],
            model="llama3-8b-8192",
            stream=False,
        )
        response = chat_completion.choices[0].message.content

        # Display response
        st.write("Response:")
        st.write(response)