"""RAG-based PDF chatbot.

Embeds PDF chunks with a sentence-transformers model, retrieves the most
relevant chunks via a FAISS L2 index, and answers questions with the Groq
chat API, all behind a Streamlit UI.
"""

import os

import faiss
import numpy as np  # NOTE(review): unused here; kept in case other code relies on it
import streamlit as st
from groq import Groq
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS  # NOTE(review): unused; FAISS is driven directly below
from sentence_transformers import SentenceTransformer

# Initialize Groq API client. Fail loudly (in the UI) if the key is missing,
# instead of erroring opaquely inside the client on the first request.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.warning("GROQ_API_KEY environment variable is not set; responses will fail.")
client = Groq(api_key=GROQ_API_KEY)

# Open-source embedding model (MiniLM, 384-dim sentence embeddings).
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def process_pdf(pdf_file):
    """Load a PDF, chunk it, embed the chunks, and build a FAISS index.

    Args:
        pdf_file: Path to a PDF file on disk.

    Returns:
        Tuple ``(texts, index)``: the list of chunk strings and an in-memory
        ``faiss.IndexFlatL2`` over their embeddings (row i of the index
        corresponds to ``texts[i]``).
    """
    documents = PyPDFLoader(pdf_file).load()

    # ~512-char chunks with 50-char overlap so context survives boundaries.
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
    chunks = splitter.split_documents(documents)

    texts = [chunk.page_content for chunk in chunks]
    embeddings = embedding_model.encode(texts, convert_to_numpy=True)

    # Exact L2 search over the embedding dimension.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    # Persisted copy for potential reuse; the returned in-memory index is
    # what the app actually queries.
    faiss.write_index(index, "faiss_index")
    return texts, index


def search_faiss(query, texts, index, top_k=3):
    """Return the ``top_k`` chunk texts most similar to ``query``.

    Args:
        query: Natural-language question to embed and search with.
        texts: Chunk strings aligned with the index rows.
        index: FAISS index built over the embeddings of ``texts``.
        top_k: Number of nearest neighbours to return.

    Returns:
        List of up to ``top_k`` matching chunk strings (FAISS pads missing
        results with -1, which we filter out).
    """
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, top_k)
    return [texts[i] for i in indices[0] if i != -1]


def get_groq_response(query):
    """Send ``query`` as a single user message to the Groq chat API.

    Args:
        query: Full prompt text (context + question).

    Returns:
        The assistant's reply as a string.
    """
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": query}],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("RAG-Based PDF Chatbot")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    with open("uploaded_document.pdf", "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.success("PDF uploaded successfully!")

    # Streamlit reruns this whole script on every widget interaction, so
    # without caching the PDF would be re-parsed and re-embedded on every
    # query. Cache the expensive indexing step in session state, keyed by
    # the uploaded file's name, so it runs once per uploaded document.
    if st.session_state.get("indexed_file") != uploaded_file.name:
        texts, index = process_pdf("uploaded_document.pdf")
        st.session_state["indexed_file"] = uploaded_file.name
        st.session_state["texts"] = texts
        st.session_state["index"] = index
    texts = st.session_state["texts"]
    index = st.session_state["index"]

    query = st.text_input("Ask a question about the document:")
    if query:
        docs = search_faiss(query, texts, index)
        context = " ".join(docs)
        final_query = f"Context: {context} \n\n Question: {query}"
        response = get_groq_response(final_query)
        st.write("Response:", response)