import os

import faiss
import numpy as np
import PyPDF2
import streamlit as st
import torch
from transformers import AutoModel, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer

# Step 1: Setup - Initialize directories and models

# Create a folder to store uploaded files
os.makedirs("uploaded_pdfs", exist_ok=True)

# Initialize tokenizer and models for embeddings and GPT-2 generation
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_model.eval()

# Set pad_token to eos_token so GPT-2 can handle padding (it has no pad token by default)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Step 2: PDF Upload and Text Extraction
st.title("RAG App: PDF Search and Response Generation")
st.subheader("Upload PDF Files")

uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)

# Initialize pdf_texts outside the if block so later steps never hit a NameError
pdf_texts = []

if uploaded_files:
    for uploaded_file in uploaded_files:
        # Save the upload to disk, then read it back for text extraction
        file_path = os.path.join("uploaded_pdfs", uploaded_file.name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        with open(file_path, "rb") as f:
            pdf_reader = PyPDF2.PdfReader(f)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only pages
                text += page.extract_text() or ""
        pdf_texts.append({"file_name": uploaded_file.name, "text": text})

    st.write(f"Processed {len(pdf_texts)} PDF(s).")
    st.write(f"Preview of the first file ({pdf_texts[0]['file_name']}):")
    st.write(pdf_texts[0]["text"][:500])
else:
    st.write("No PDFs uploaded yet. Please upload a PDF to process.")

# Step 3: Chunking Text
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into fixed-size character chunks with a small overlap."""
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap  # step forward, keeping `overlap` characters of context
    return chunks

chunked_data = []
if pdf_texts:  # Only proceed if pdf_texts is populated
    for pdf in pdf_texts:
        chunks = chunk_text(pdf["text"], chunk_size=500, overlap=50)
        chunked_data.append({"file_name": pdf["file_name"], "chunks": chunks})
    st.write("Text chunking complete.")
    st.write(f"Number of chunks for first file: {len(chunked_data[0]['chunks'])}")
else:
    st.write("No text available for chunking. Please upload a PDF.")

# Step 4: Generate Embeddings
def generate_embeddings(chunks):
    """Embed each chunk by mean-pooling the model's last hidden state."""
    embeddings = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return embeddings

# Generate embeddings for the chunked data
if chunked_data:  # Only generate embeddings if chunked_data is populated
    for pdf in chunked_data:
        pdf["embeddings"] = generate_embeddings(pdf["chunks"])
    st.write("Embeddings generated successfully.")
else:
    st.write("No chunks available for embeddings. Please upload and process a PDF.")
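# Note: the plain mean over last_hidden_state above also averages padding
# positions if chunks are ever batched together. A mask-aware pooling variant
# (a sketch only; masked_mean_pooling is an illustrative helper, not used by
# the app above):
def masked_mean_pooling(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
    summed = (last_hidden_state * mask).sum(dim=1)   # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)         # avoid division by zero
    return summed / counts                           # (batch, hidden_size)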
# Step 5: Query and Generate Response Using FAISS
def generate_query_embedding(query):
    """Embed the query with the same sentence-transformers model used for the chunks."""
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def generate_response(query, faiss_index, text_data):
    # Generate the query embedding and match the index's expected shape and dtype
    # (FAISS requires float32 input)
    query_embedding = generate_query_embedding(query)
    query_embedding_flat = query_embedding.reshape(1, -1).astype(np.float32)

    # Perform a similarity search; never ask for more neighbors than the index holds
    k = min(3, faiss_index.ntotal)
    _, indices = faiss_index.search(query_embedding_flat, k)

    # Retrieve the closest chunks (FAISS pads missing results with -1)
    closest_chunks = [text_data[i] for i in indices[0] if i != -1]
    context = " ".join(closest_chunks)

    # Build a prompt for GPT-2, truncating so the prompt plus 200 new tokens
    # stays within GPT-2's 1024-token context window
    prompt = (
        "Answer the following question based on the provided context:\n\n"
        f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    )
    input_ids = gpt2_tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=824)

    # Use max_new_tokens instead of max_length to avoid the input-length conflict
    outputs = gpt2_model.generate(
        input_ids,
        max_new_tokens=200,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so the prompt is not echoed back
    response = gpt2_tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    return response

# Step 6: Test the Query
query = st.text_input("Enter your query:")
if query and chunked_data:
    text_data = [chunk for pdf in chunked_data for chunk in pdf["chunks"]]
    embeddings = np.vstack([pdf["embeddings"] for pdf in chunked_data])

    # Build a flat L2 index over the chunk embeddings
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(embeddings.astype(np.float32))

    response = generate_response(query, faiss_index, text_data)
    st.write("Generated Response:")
    st.write(response)
elif query:
    st.write("Please upload a PDF before querying.")
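# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py

# Streamlit re-executes the entire script on every widget interaction, so the
# models above are reloaded on each rerun. One way to avoid that (a sketch,
# assuming a Streamlit version that provides st.cache_resource) is to wrap the
# loading in a cached function and call it once near the top of the script:
@st.cache_resource
def load_embedding_model():
    tok = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    mod = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    mod.eval()
    return tok, mod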