| | import os |
| | import PyPDF2 |
| | import faiss |
| | import torch |
| | import numpy as np |
| | from transformers import AutoTokenizer, AutoModel, GPT2LMHeadModel, GPT2Tokenizer |
| | import streamlit as st |
| |
|
| | |
| |
|
| | |
# Directory where uploaded PDFs are persisted between Streamlit reruns.
os.makedirs("uploaded_pdfs", exist_ok=True)


@st.cache_resource
def _load_models():
    """Load the embedding and generation models once per Streamlit session.

    Streamlit re-executes the whole script on every user interaction;
    without caching, all four models would be re-loaded from disk (or
    re-downloaded) on each rerun, making the app unusably slow.

    Returns:
        (embed_tokenizer, embed_model, lm_tokenizer, lm_model)
    """
    embed_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    embed_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    lm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    lm_model = GPT2LMHeadModel.from_pretrained("gpt2")
    lm_model.eval()  # inference only; disables dropout
    # GPT-2 ships without a pad token; reuse EOS so padding is well-defined.
    lm_tokenizer.pad_token = lm_tokenizer.eos_token
    return embed_tokenizer, embed_model, lm_tokenizer, lm_model


# Keep the original module-level names so the rest of the script is unchanged.
tokenizer, model, gpt2_tokenizer, gpt2_model = _load_models()
| |
|
| | |
st.title("RAG App: PDF Search and Response Generation")

st.subheader("Upload PDF Files")
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)

# One entry per uploaded PDF: {"file_name": str, "text": str}.
pdf_texts = []

if uploaded_files:
    for uploaded_file in uploaded_files:
        # Persist the upload so it survives Streamlit reruns.
        save_path = os.path.join("uploaded_pdfs", uploaded_file.name)
        with open(save_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        with open(save_path, "rb") as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # extract_text() can yield None for pages without a text layer
            # (e.g. scanned images); `or ""` avoids a TypeError and the
            # join avoids quadratic string concatenation.
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        pdf_texts.append({"file_name": uploaded_file.name, "text": text})

    st.write(f"Processed {len(pdf_texts)} PDF(s).")
    st.write(f"Preview of the first file ({pdf_texts[0]['file_name']}):")
    st.write(pdf_texts[0]["text"][:500])
else:
    st.write("No PDFs uploaded yet. Please upload a PDF to process.")
| |
|
| | |
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping character chunks.

    Each chunk is at most ``chunk_size`` characters and consecutive chunks
    share ``overlap`` characters so sentences straddling a boundary are
    not lost.

    Args:
        text: The string to split; an empty string yields no chunks.
        chunk_size: Maximum characters per chunk.
        overlap: Characters shared between consecutive chunks.

    Returns:
        List of chunk strings covering all of *text*.

    Raises:
        ValueError: If ``overlap >= chunk_size`` (the original loop never
            advanced in that case and spun forever).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # Last chunk already reaches the end of the text; stepping back
            # by `overlap` here would emit a redundant trailing fragment
            # fully contained in the chunk just appended.
            break
        start = end - overlap
    return chunks
| |
|
# One record per PDF: {"file_name": str, "chunks": list[str]}.
chunked_data = []

if pdf_texts:
    # Build the chunk records in one pass instead of appending in a loop.
    chunked_data = [
        {
            "file_name": pdf["file_name"],
            "chunks": chunk_text(pdf["text"], chunk_size=500, overlap=50),
        }
        for pdf in pdf_texts
    ]

    st.write("Text chunking complete.")
    st.write(f"Number of chunks for first file: {len(chunked_data[0]['chunks'])}")
else:
    st.write("No text available for chunking. Please upload a PDF.")
| |
|
| | |
def generate_embeddings(chunks):
    """Embed each text chunk with the MiniLM encoder.

    Returns one mean-pooled hidden-state vector (1-D numpy array) per
    chunk, in the same order as *chunks*.
    """

    def _embed(piece):
        encoded = tokenizer(piece, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        # Mean over the token axis collapses (1, seq, dim) -> (dim,).
        return hidden.mean(dim=1).squeeze().numpy()

    return [_embed(chunk) for chunk in chunks]
| |
|
| | |
# Attach an "embeddings" list (parallel to "chunks") to each PDF record.
if not chunked_data:
    st.write("No chunks available for embeddings. Please upload and process a PDF.")
else:
    for record in chunked_data:
        record["embeddings"] = generate_embeddings(record["chunks"])
    st.write("Embeddings generated successfully.")
| |
|
| | |
def generate_query_embedding(query):
    """Mean-pool the MiniLM hidden states for *query* into a single 1-D vector."""
    encoded = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # (1, seq, dim) -> (dim,) via mean over tokens.
    return hidden_states.mean(dim=1).squeeze().numpy()
| |
|
def generate_response(query, faiss_index, embeddings, text_data, top_k=3):
    """Retrieve the chunks closest to *query* and answer with GPT-2.

    Args:
        query: The user's question.
        faiss_index: A populated FAISS index over the chunk embeddings.
        embeddings: The stacked chunk embeddings (kept for interface
            compatibility; retrieval goes through *faiss_index*).
        text_data: Flat list of chunk strings, aligned with the index rows.
        top_k: Number of nearest chunks to use as context (default 3,
            matching the original hard-coded value).

    Returns:
        The generated answer text (without the echoed prompt).
    """
    query_embedding = generate_query_embedding(query)

    # FAISS expects float32 row vectors of shape (n_queries, dim).
    query_vec = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)

    # Never request more neighbours than stored vectors: FAISS pads
    # missing results with index -1, which would silently fetch
    # text_data[-1] below. Also drop any -1 defensively.
    k = min(top_k, len(text_data))
    _, indices = faiss_index.search(query_vec, k=k)
    closest_chunks = [text_data[i] for i in indices[0] if i >= 0]
    context = " ".join(closest_chunks)

    prompt = f"Answer the following question based on the provided context:\n\nContext: {context}\n\nQuestion: {query}\nAnswer:"

    # GPT-2's context window is 1024 tokens; truncate the prompt so the
    # 200 generated tokens cannot overflow the window and crash generate().
    encoded = gpt2_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024 - 200)

    outputs = gpt2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=200,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens; the original decoded the
    # full sequence, so the "response" began with the entire prompt.
    new_tokens = outputs[0][encoded["input_ids"].shape[1]:]
    return gpt2_tokenizer.decode(new_tokens, skip_special_tokens=True)
| |
|
| |
|
| | |
# --- Query UI: build a FAISS index over all chunks and generate an answer ---
query = st.text_input("Enter your query:")
if query:
    if not chunked_data:
        # Without processed PDFs there are no embeddings; np.vstack([])
        # would raise ValueError and crash the app on any typed query.
        st.warning("Please upload and process at least one PDF before querying.")
    else:
        # Flatten chunks across all PDFs; row order matches the stacked
        # embeddings so FAISS indices map straight back to chunk text.
        text_data = [chunk for pdf in chunked_data for chunk in pdf["chunks"]]
        embeddings = np.vstack([pdf["embeddings"] for pdf in chunked_data])

        # Exact L2 search; FAISS only accepts float32 vectors.
        dimension = embeddings.shape[1]
        faiss_index = faiss.IndexFlatL2(dimension)
        faiss_index.add(embeddings.astype(np.float32))

        response = generate_response(query, faiss_index, embeddings, text_data)
        st.write("Generated Response:")
        st.write(response)
| |
|