# RAG-AI-Tutor / app.py
# Uploaded by Shahbazakbar ("Update app.py", commit 73698d1, verified).
import os
import tempfile

import chromadb
import PyPDF2
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Encrypted PDFs are not supported: an error is shown in the Streamlit UI
    and ``None`` is returned. Any exception raised while opening or parsing
    the file is likewise reported through ``st.error`` and results in
    ``None``.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf = PyPDF2.PdfReader(pdf_file)
            if pdf.is_encrypted:
                st.error("This PDF is encrypted. Encryption support is not available in this version.")
                return None
            # extract_text() may yield None for a page; treat that as empty.
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as exc:
        st.error(f"Error reading PDF: {exc!s}")
        return None
def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks for embedding.

    Args:
        text: The full document text to split.
        chunk_size: Maximum size of each chunk, as measured by the splitter
            (characters by default). Defaults to 500.
        chunk_overlap: Amount of content shared between consecutive chunks.
            Defaults to 50.

    Returns:
        The list of chunk strings produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)
def store_in_vector_db(chunks):
    """Embed *chunks* and index them in an in-memory Chroma collection.

    Args:
        chunks: List of text chunks to embed and store.

    Returns:
        A ``(collection, model)`` tuple: the Chroma collection named
        ``"pdf_chunks"`` holding the chunks, and the ``SentenceTransformer``
        used to embed them (callers reuse it to embed queries).
    """
    # Use EphemeralClient for in-memory, no tenant/database setup
    client = chromadb.EphemeralClient()
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Streamlit re-runs the script in the same process on every interaction,
    # and in-memory Chroma clients can share state within a process, so the
    # collection may already exist. create_collection() would raise in that
    # case; reuse the collection and purge whatever a previous run left in it.
    collection = client.get_or_create_collection("pdf_chunks")
    stale_ids = collection.get(include=[])["ids"]
    if stale_ids:
        collection.delete(ids=stale_ids)

    # collection.add() rejects empty input; hand back the empty collection.
    if not chunks:
        return collection, model

    embeddings = model.encode(chunks)
    collection.add(
        documents=chunks,
        embeddings=embeddings.tolist(),
        ids=[f"chunk_{i}" for i in range(len(chunks))],
    )
    return collection, model
def retrieve_and_generate(query, collection, embedding_model):
    """Answer *query* using the most relevant chunks stored in *collection*.

    Args:
        query: The user's question.
        collection: Chroma collection to search; its ``query`` method is
            called with the embedded question.
        embedding_model: Model whose ``encode`` method embeds the question
            (the same model used to embed the stored chunks).

    Returns:
        The text generated by the model for the question and retrieved
        context, without the prompt echoed back.
    """
    query_embedding = embedding_model.encode([query]).tolist()
    results = collection.query(query_embeddings=query_embedding, n_results=3)
    # One query was sent, so the matching documents are in the first result list.
    context = " ".join(results['documents'][0])

    # NOTE(review): facebook/bart-large is an encoder-decoder checkpoint;
    # confirm it is the intended model for the 'text-generation' task.
    generator = pipeline('text-generation', model='facebook/bart-large')
    prompt = f"Question: {query}\nContext: {context}"
    # max_length would count the prompt tokens too, and the prompt (question
    # plus up to three chunks) is normally longer than 100 tokens, so bound
    # only the newly generated tokens. return_full_text=False keeps the
    # prompt out of the returned answer.
    outputs = generator(prompt, max_new_tokens=100, return_full_text=False)
    return outputs[0]['generated_text']
def main():
    """Run the Streamlit app: upload a PDF, index it, and answer questions.

    The uploaded file is written to a temporary file, its text is extracted,
    chunked and stored in the vector DB, and then a text input lets the user
    ask questions that are answered via ``retrieve_and_generate``.
    """
    st.title("RAG PDF Q&A")
    st.write("This is a helpful AI tutor, and its prime responsibility is to explain concepts to students.")
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if not uploaded_file:
        return

    # Use a unique temporary file rather than a fixed "temp.pdf" in the
    # working directory, so concurrent sessions cannot overwrite each other's
    # upload, and remove it once the text has been extracted.
    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(uploaded_file.getbuffer())
        with st.spinner("Processing PDF..."):
            text = extract_text_from_pdf(temp_path)
            if text is None:
                # extract_text_from_pdf has already reported the error.
                return
            chunks = chunk_text(text)
            if not chunks:
                # e.g. a scanned PDF with no text layer: nothing to index.
                st.warning("No extractable text was found in this PDF.")
                return
            collection, embedding_model = store_in_vector_db(chunks)
    finally:
        os.remove(temp_path)
    st.success("PDF processed successfully!")

    query = st.text_input("Ask a question about the PDF:")
    if query:
        with st.spinner("Generating response..."):
            response = retrieve_and_generate(query, collection, embedding_model)
        st.text_area("Response", value=response, height=200)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()