"""Streamlit app that answers questions about an uploaded PDF.

Pipeline: extract text with PyPDF2, split it into overlapping chunks, embed
the chunks with a sentence-transformers model, store them in an in-memory
Chroma collection, then retrieve the closest chunks for a question and pass
them to a Hugging Face text-generation pipeline.
"""

import os
import tempfile
import uuid

import chromadb
import PyPDF2
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline

EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# NOTE(review): facebook/bart-large is an encoder-decoder checkpoint rather
# than a model tuned for free-form generation -- confirm this is the intended
# model for the 'text-generation' task.
GENERATION_MODEL_NAME = "facebook/bart-large"
DEFAULT_COLLECTION_NAME = "pdf_chunks"
MAX_RETRIEVED_CHUNKS = 3
MAX_NEW_TOKENS = 100


@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer(EMBEDDING_MODEL_NAME)


@st.cache_resource(show_spinner=False)
def _load_generator():
    """Build the text-generation pipeline once and reuse it across reruns."""
    return pipeline("text-generation", model=GENERATION_MODEL_NAME)


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text (possibly empty), or ``None`` when the PDF is
        encrypted or cannot be read. In both failure cases an error is shown
        in the Streamlit UI instead of raising.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                st.error(
                    "This PDF is encrypted. Encryption support is not "
                    "available in this version."
                )
                return None
            # extract_text() may return None for a page; treat that as "".
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:  # UI boundary: report the failure, do not crash.
        st.error(f"Error reading PDF: {e}")
        return None


def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks for embedding.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)


def store_in_vector_db(chunks, collection_name=DEFAULT_COLLECTION_NAME):
    """Embed *chunks* and store them in an in-memory Chroma collection.

    Args:
        chunks: Non-empty list of text chunks to index.
        collection_name: Name of the Chroma collection to (re)populate.

    Returns:
        A ``(collection, embedding_model)`` tuple.

    Raises:
        ValueError: If *chunks* is empty.
    """
    if not chunks:
        raise ValueError("No text chunks to index.")

    # Use EphemeralClient for in-memory, no tenant/database setup
    client = chromadb.EphemeralClient()
    model = _load_embedding_model()
    embeddings = model.encode(chunks)

    # The script is re-executed on every Streamlit interaction, so the
    # collection may already exist; reuse it and drop its previous contents
    # instead of failing on a duplicate name or mixing in stale chunks.
    collection = client.get_or_create_collection(collection_name)
    stale_ids = collection.get()["ids"]
    if stale_ids:
        collection.delete(ids=stale_ids)

    collection.add(
        documents=chunks,
        embeddings=embeddings.tolist(),
        ids=[f"chunk_{i}" for i in range(len(chunks))],
    )
    return collection, model


def retrieve_and_generate(query, collection, embedding_model):
    """Answer *query* using the chunks in *collection* closest to it.

    Args:
        query: The user's question.
        collection: Chroma collection holding the indexed chunks.
        embedding_model: Model used to embed the query; it must be the same
            model that embedded the stored chunks.

    Returns:
        The generated text returned by the pipeline for the built prompt.
    """
    query_embedding = embedding_model.encode([query]).tolist()
    # Never ask for more neighbours than the collection holds.
    n_results = max(1, min(MAX_RETRIEVED_CHUNKS, collection.count()))
    results = collection.query(
        query_embeddings=query_embedding, n_results=n_results
    )
    context = " ".join(results["documents"][0])

    prompt = f"Question: {query}\nContext: {context}"
    # max_new_tokens bounds only the generated continuation; a max_length
    # limit would also count the (long) prompt and leave no room to generate.
    outputs = _load_generator()(prompt, max_new_tokens=MAX_NEW_TOKENS)
    return outputs[0]["generated_text"]


def _index_uploaded_pdf(uploaded_file):
    """Extract, chunk and index an uploaded PDF.

    Returns:
        The ``(collection, embedding_model)`` tuple, or ``None`` when the PDF
        could not be read or contains no extractable text (a message is shown
        in the UI in both cases).
    """
    # Write to a unique temporary file so concurrent sessions do not clobber
    # each other; delete=False lets the file be reopened by path on Windows.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        tmp_path = tmp.name
    try:
        text = extract_text_from_pdf(tmp_path)
    finally:
        os.remove(tmp_path)

    if text is None:
        return None

    chunks = chunk_text(text)
    if not chunks:
        st.warning("No extractable text was found in this PDF.")
        return None

    # One collection per browser session, so sessions sharing the same
    # process do not overwrite each other's index.
    if "collection_name" not in st.session_state:
        st.session_state["collection_name"] = f"pdf_chunks_{uuid.uuid4().hex}"
    return store_in_vector_db(chunks, st.session_state["collection_name"])


def main():
    """Render the Streamlit UI: upload a PDF, index it, answer questions."""
    st.title("RAG PDF Q&A")
    st.write(
        "This is a helpful AI tutor, and its prime responsibility is to "
        "explain concepts to students."
    )

    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if not uploaded_file:
        return

    # Streamlit reruns this function on every interaction; index the PDF only
    # when a different file has been uploaded.
    file_key = (
        uploaded_file.name,
        uploaded_file.size,
        getattr(uploaded_file, "file_id", None),
    )
    if (
        "pdf_index" not in st.session_state
        or st.session_state["indexed_file"] != file_key
    ):
        with st.spinner("Processing PDF..."):
            index = _index_uploaded_pdf(uploaded_file)
        if index is None:
            return
        st.session_state["pdf_index"] = index
        st.session_state["indexed_file"] = file_key

    collection, embedding_model = st.session_state["pdf_index"]
    st.success("PDF processed successfully!")

    query = st.text_input("Ask a question about the PDF:")
    if query:
        with st.spinner("Generating response..."):
            response = retrieve_and_generate(query, collection, embedding_model)
        st.text_area("Response", value=response, height=200)


if __name__ == "__main__":
    main()