import os
from io import BytesIO
import streamlit as st
from PyPDF2 import PdfReader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from groq import Groq
import tempfile
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Initialize Groq API client.
# SECURITY: read the key from the environment -- never commit a literal API
# key to source control. The previously hard-coded key must be revoked.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))


@st.cache_resource
def _get_embedding_model():
    """Load the sentence-embedding model once per process.

    `st.cache_resource` keeps a single shared instance across Streamlit
    reruns, avoiding a costly model load on every embedding/query call.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")


# Helper Functions
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate text from every page of an uploaded PDF.

    Args:
        pdf_file: A file-like object (e.g. Streamlit's UploadedFile).

    Returns:
        str: All extractable page text joined together; may be empty if the
        PDF contains only images or no extractable text.
    """
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() can return None for image-only pages
            text += page_text
    return text


def create_chunks(text, chunk_size=500):
    """Split *text* into fixed-size character chunks for embedding.

    Args:
        text: The full document text.
        chunk_size: Maximum characters per chunk (default 500).

    Returns:
        list[str]: Consecutive, non-overlapping slices of *text*.
    """
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def create_embeddings(chunks):
    """Embed text chunks and build a FAISS L2 index over them.

    Args:
        chunks: Non-empty list of text chunks.

    Returns:
        faiss.IndexFlatL2: Index containing one vector per chunk, in order.

    Raises:
        ValueError: If *chunks* is empty.
    """
    if not chunks:
        raise ValueError("No text chunks provided for embedding.")

    model = _get_embedding_model()
    embeddings = model.encode(chunks)

    # Ensure embeddings is 2D even when a single chunk is encoded.
    if len(embeddings.shape) == 1:
        embeddings = np.expand_dims(embeddings, axis=0)

    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings.astype('float32'))  # FAISS expects float32
    return faiss_index


def interact_with_model(query, faiss_index, chunks, top_k=3):
    """Answer *query* using the chunks most similar to it as LLM context.

    Args:
        query: The user's question.
        faiss_index: FAISS index built by :func:`create_embeddings`.
        chunks: The original text chunks, in the same order as the index.
        top_k: Maximum number of nearest chunks to retrieve (default 3).

    Returns:
        str: The model's answer text.
    """
    model = _get_embedding_model()
    query_embedding = model.encode([query])

    # Never request more neighbors than the index holds: FAISS pads missing
    # results with index -1, and chunks[-1] would silently fetch the last
    # chunk instead of being filtered out.
    k = min(top_k, len(chunks))
    distances, indices = faiss_index.search(query_embedding.astype('float32'), k=k)

    # Keep only valid, in-range indices (drops FAISS's -1 padding too).
    docs = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
    context = " ".join(docs)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Context: {context}\n\n{query}"},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages, model="llama-3.3-70b-versatile"
    )
    return chat_completion.choices[0].message.content


# Streamlit Frontend
def main():
    """Streamlit UI: upload a PDF, index it once, then answer questions."""
    st.title("PDF Query App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])

    if uploaded_file is not None:
        # Streamlit reruns this script on every interaction (including each
        # query submission); cache the extracted chunks and index per file so
        # the PDF is not re-parsed and re-embedded on every rerun.
        cache_key = (uploaded_file.name, uploaded_file.size)
        if st.session_state.get("pdf_cache_key") != cache_key:
            text = extract_text_from_pdf(uploaded_file)
            if not text.strip():
                st.error("PDF contains no extractable text. Upload a valid PDF.")
                return

            chunks = create_chunks(text)
            if not chunks:
                st.error("No text chunks created. Check PDF content.")
                return

            try:
                faiss_index = create_embeddings(chunks)
            except Exception as e:
                st.error(f"Error creating embeddings: {str(e)}")
                return

            st.session_state["pdf_cache_key"] = cache_key
            st.session_state["pdf_chunks"] = chunks
            st.session_state["pdf_faiss_index"] = faiss_index

        chunks = st.session_state["pdf_chunks"]
        faiss_index = st.session_state["pdf_faiss_index"]

        query = st.text_input("Ask a question about the PDF:")
        if query:
            response = interact_with_model(query, faiss_index, chunks)
            st.write(response)


if __name__ == "__main__":
    main()