import os import PyPDF2 import faiss import streamlit as st from sentence_transformers import SentenceTransformer from groq import Groq # Set up Groq client client = Groq(api_key="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") # Initialize model and FAISS index embedding_model = SentenceTransformer('all-MiniLM-L6-v2') index = faiss.IndexFlatL2(384) # Adjust dimension to match the embedding size # PDF text extraction def extract_text_from_pdf(pdf_file): pdf_reader = PyPDF2.PdfReader(pdf_file) text = "" for page in pdf_reader.pages: text += page.extract_text() return text # Text chunking def chunk_text(text, chunk_size=500): return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] # Embed and store in FAISS def embed_and_store(chunks): embeddings = embedding_model.encode(chunks) index.add(embeddings) return embeddings # Retrieve relevant chunks def retrieve_chunks(query, top_k=5): query_embedding = embedding_model.encode([query]) distances, indices = index.search(query_embedding, top_k) return indices # Query Groq API def query_groq(prompt): chat_completion = client.chat.completions.create( messages=[{"role": "user", "content": prompt}], model="llama3-8b-8192" ) return chat_completion.choices[0].message.content # Streamlit UI def main(): st.title("RAG-based PDF QA System") st.sidebar.header("Upload and Interact") uploaded_file = st.sidebar.file_uploader("Upload a PDF", type=["pdf"]) if uploaded_file: st.sidebar.success("PDF Uploaded Successfully!") text = extract_text_from_pdf(uploaded_file) chunks = chunk_text(text) embed_and_store(chunks) st.write("PDF content has been processed and stored.") query = st.text_input("Enter your question:") if query: indices = retrieve_chunks(query) relevant_chunks = [chunks[i] for i in indices[0]] prompt = " ".join(relevant_chunks) + f"\n\nQuestion: {query}" answer = query_groq(prompt) st.write("### Answer:") st.write(answer) if __name__ == "__main__": main()