# myRAG2 / app.py
import os
import PyPDF2
import faiss
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel, GPT2Tokenizer
import streamlit as st
# Step 1: Setup - Initialize directories and models
# Create a folder to store uploaded files
os.makedirs("uploaded_pdfs", exist_ok=True)
# Initialize tokenizer and models for embeddings and GPT-2 generation
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_model.eval()
# GPT-2 has no pad token by default, so reuse the end-of-sequence token for padding
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
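# Note: Streamlit re-runs this script from top to bottom on every interaction,
# so the four models above are reloaded each time. A minimal sketch of the
# usual fix, assuming a Streamlit version that provides st.cache_resource
# (not wired in here):
#
#   @st.cache_resource
#   def load_models():
#       tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#       emb = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#       return tok, emb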
# Step 2: PDF Upload and Text Extraction
st.title("RAG App: PDF Search and Response Generation")
st.subheader("Upload PDF Files")
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)
pdf_texts = []  # Initialized outside the if block so later steps can check it without a NameError
if uploaded_files:
    for uploaded_file in uploaded_files:
        # Save the upload to disk, then read it back for text extraction
        with open(f"uploaded_pdfs/{uploaded_file.name}", "wb") as f:
            f.write(uploaded_file.getbuffer())
        with open(f"uploaded_pdfs/{uploaded_file.name}", "rb") as f:
            pdf_reader = PyPDF2.PdfReader(f)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for pages without a text layer
                text += page.extract_text() or ""
            pdf_texts.append({"file_name": uploaded_file.name, "text": text})
    st.write(f"Processed {len(pdf_texts)} PDF(s).")
    st.write(f"Preview of the first file ({pdf_texts[0]['file_name']}):")
    st.write(pdf_texts[0]["text"][:500])
else:
    st.write("No PDFs uploaded yet. Please upload a PDF to process.")
# Step 3: Chunking Text
def chunk_text(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap  # advance by chunk_size - overlap characters
    return chunks
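# Worked example of the overlap arithmetic: with chunk_size=500 and overlap=50,
# a 1200-character text yields chunks spanning [0:500], [450:950], and
# [900:1200]; each chunk starts 450 characters after the previous one and
# re-reads its last 50 characters.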
chunked_data = []
if pdf_texts:  # Only proceed if pdf_texts is populated
    for pdf in pdf_texts:
        file_name = pdf["file_name"]
        text = pdf["text"]
        chunks = chunk_text(text, chunk_size=500, overlap=50)
        chunked_data.append({"file_name": file_name, "chunks": chunks})
    st.write("Text chunking complete.")
    st.write(f"Number of chunks for first file: {len(chunked_data[0]['chunks'])}")
else:
    st.write("No text available for chunking. Please upload a PDF.")
# Step 4: Generate Embeddings
def generate_embeddings(chunks):
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool the token states into one vector per chunk
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings
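# The plain mean above would also average padding positions if chunks were
# embedded in batches (with one chunk at a time there is no padding). A
# masked-mean sketch (hypothetical helper, not used by this app) would weight
# by the attention mask instead:
#
#   def masked_mean_embedding(inputs, outputs):
#       mask = inputs["attention_mask"].unsqueeze(-1).float()
#       summed = (outputs.last_hidden_state * mask).sum(dim=1)
#       return (summed / mask.sum(dim=1)).squeeze().numpy()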
# Generate embeddings for the chunked data
if chunked_data:  # Only generate embeddings if chunked_data is populated
    for pdf in chunked_data:
        pdf["embeddings"] = generate_embeddings(pdf["chunks"])
    st.write("Embeddings generated successfully.")
else:
    st.write("No chunks available for embeddings. Please upload and process a PDF.")
# Step 5: Query and Generate Response Using FAISS
def generate_query_embedding(query):
    # Embed the query with the same sentence-transformers model used for the chunks
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
def generate_response(query, faiss_index, embeddings, text_data):
    query_embedding = generate_query_embedding(query)
    # FAISS expects a 2-D float32 array of shape (n_queries, dimension)
    query_embedding_flat = query_embedding.reshape(1, -1).astype(np.float32)
    # Retrieve the three closest chunks (FAISS pads missing results with -1)
    _, indices = faiss_index.search(query_embedding_flat, k=3)
    closest_chunks = [text_data[i] for i in indices[0] if i != -1]
    context = " ".join(closest_chunks)
    # Build a prompt for GPT-2 from the retrieved context
    prompt = f"Answer the following question based on the provided context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"
    input_ids = gpt2_tokenizer.encode(prompt, return_tensors="pt")
    # max_new_tokens (rather than max_length) keeps the generation budget
    # independent of the prompt length
    outputs = gpt2_model.generate(
        input_ids,
        max_new_tokens=200,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    return gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
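# GPT-2's context window is 1024 tokens, so a long retrieved context plus a
# 200-token generation budget can overflow it. A defensive sketch (assuming
# the prompt should be capped to leave room for the answer):
#
#   input_ids = gpt2_tokenizer.encode(
#       prompt, return_tensors="pt", truncation=True, max_length=1024 - 200
#   )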
# Step 6: Test the Query
query = st.text_input("Enter your query:")
if query and chunked_data:
    # Flatten chunks and embeddings across all PDFs, then build a FAISS index
    text_data = [chunk for pdf in chunked_data for chunk in pdf["chunks"]]
    embeddings = np.vstack([pdf["embeddings"] for pdf in chunked_data])
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings.astype(np.float32))
    response = generate_response(query, faiss_index, embeddings, text_data)
    st.write("Generated Response:")
    st.write(response)
elif query:
    st.write("No documents indexed yet. Please upload and process a PDF before querying.")
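# IndexFlatL2 ranks chunks by squared Euclidean distance. Because these
# embeddings are not normalized, a cosine-similarity variant (sketch, assuming
# the same (n_chunks, dimension) float32 matrix) would L2-normalize the rows
# and switch to an inner-product index:
#
#   embeddings = embeddings.astype(np.float32)
#   faiss.normalize_L2(embeddings)           # in-place row normalization
#   faiss_index = faiss.IndexFlatIP(dimension)
#   faiss_index.add(embeddings)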