Spaces:
Sleeping
Sleeping
import hashlib
import os
import tempfile

import chromadb
import PyPDF2
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Problems are reported to the user through ``st.error`` instead of being
    raised, so callers only need to check for ``None``.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages joined with newlines, or ``None`` when the PDF
        is encrypted or could not be read.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            if reader.is_encrypted:
                st.error("This PDF is encrypted. Encryption support is not available in this version.")
                return None
            # extract_text() can return None for a page; treat that as empty.
            # Pages are joined with a newline so the last word of one page
            # does not fuse with the first word of the next.
            return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # UI boundary: surface any read/parse failure to the user rather than
        # crashing the app.
        st.error(f"Error reading PDF: {str(e)}")
        return None
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split text into overlapping chunks suitable for embedding.

    Args:
        text: The full text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (defaults to the previously hard-coded 500).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (defaults to the previously hard-coded 50).

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
def store_in_vector_db(chunks):
    """Embed text chunks and index them in an in-memory Chroma collection.

    The collection named ``pdf_chunks`` is reused if it already exists and is
    emptied first, so the returned collection only ever contains ``chunks``.

    Args:
        chunks: List of text chunks to embed and store. May be empty, in
            which case an empty collection is returned.

    Returns:
        A ``(collection, model)`` tuple: the Chroma collection holding the
        chunks and the ``SentenceTransformer`` used to embed them (needed
        later to embed queries the same way).
    """
    # Use EphemeralClient for in-memory, no tenant/database setup
    client = chromadb.EphemeralClient()
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # create_collection() raises when the name already exists, which happens
    # when this function runs more than once against the same in-memory store
    # (e.g. a Streamlit rerun in the same process). Reuse the collection and
    # drop whatever an earlier call left behind.
    collection = client.get_or_create_collection("pdf_chunks")
    stale_ids = collection.get(include=[])["ids"]
    if stale_ids:
        collection.delete(ids=stale_ids)

    # Chroma rejects an add() with empty lists, so skip it when there is
    # nothing to index (e.g. a PDF with no extractable text).
    if chunks:
        embeddings = model.encode(chunks)
        collection.add(
            documents=chunks,
            embeddings=embeddings.tolist(),
            ids=[f"chunk_{i}" for i in range(len(chunks))],
        )
    return collection, model
def retrieve_and_generate(query, collection, embedding_model):
    """Answer a question using the most similar stored chunks as context.

    Args:
        query: The user's question.
        collection: Chroma collection holding the document chunks.
        embedding_model: Model whose ``encode`` method embeds the query; it
            should be the same model that embedded the stored chunks.

    Returns:
        The text generated by the language model for the prompt built from
        the question and the retrieved context. The prompt itself is not
        included in the returned text.
    """
    query_embedding = embedding_model.encode([query]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=3)
    # One query was submitted, so the matches are in the first result list.
    context = " ".join(results['documents'][0])
    prompt = f"Question: {query}\nContext: {context}"

    # NOTE(review): facebook/bart-large is an encoder-decoder checkpoint and
    # is not instruction-tuned; confirm it is the intended model for the
    # 'text-generation' task.
    # NOTE(review): the pipeline is rebuilt (model reloaded) on every call;
    # consider caching it if latency matters.
    generator = pipeline('text-generation', model='facebook/bart-large')

    # max_length would count the prompt tokens too, and the prompt (question
    # plus up to three retrieved chunks) can exceed 100 tokens on its own.
    # max_new_tokens limits only the generated part. return_full_text=False
    # keeps the prompt and context out of the answer shown to the user.
    outputs = generator(prompt, max_new_tokens=100, return_full_text=False)
    return outputs[0]['generated_text']
def main():
    """Run the Streamlit app: upload a PDF, index it, and answer questions.

    The uploaded PDF is processed once per distinct file content; the
    resulting index is kept in ``st.session_state`` so that Streamlit reruns
    (triggered by every widget interaction) do not re-extract and re-embed
    the document.
    """
    st.title("RAG PDF Q&A")
    st.write("This is a helpful AI tutor, and its prime responsibility is to explain concepts to students.")
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if not uploaded_file:
        return

    pdf_bytes = uploaded_file.getvalue()
    # Key the cached index on the file content so a different upload (even
    # with the same name) is re-processed.
    pdf_digest = hashlib.sha256(pdf_bytes).hexdigest()

    if st.session_state.get("pdf_digest") != pdf_digest:
        with st.spinner("Processing PDF..."):
            # Write to a private temporary directory instead of a fixed
            # "temp.pdf" in the working directory: concurrent sessions no
            # longer overwrite each other's upload, and the file is removed
            # when the block exits.
            with tempfile.TemporaryDirectory() as tmp_dir:
                pdf_path = os.path.join(tmp_dir, "upload.pdf")
                with open(pdf_path, "wb") as f:
                    f.write(pdf_bytes)
                text = extract_text_from_pdf(pdf_path)
            if text is None:
                # extract_text_from_pdf has already shown the error.
                return
            if not text.strip():
                # e.g. a scanned PDF without a text layer: nothing to index.
                st.error("No extractable text was found in this PDF.")
                return
            chunks = chunk_text(text)
            st.session_state["pdf_index"] = store_in_vector_db(chunks)
            st.session_state["pdf_digest"] = pdf_digest

    collection, embedding_model = st.session_state["pdf_index"]
    st.success("PDF processed successfully!")

    query = st.text_input("Ask a question about the PDF:")
    if query:
        with st.spinner("Generating response..."):
            response = retrieve_and_generate(query, collection, embedding_model)
        st.text_area("Response", value=response, height=200)
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()