# AB_Testing_RAG / preprocess.py
# Commit 6b569cb (kamkol): Fix PDFLoader to process pages individually
# for correct page numbering
import os
import pickle
import numpy as np
from aimakerspace.text_utils import CharacterTextSplitter, PDFLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.vectordatabase import VectorDatabase
import asyncio
import dotenv
import glob
# Pull environment variables from a local .env file when one is present.
dotenv.load_dotenv()

# Refuse to run without an OpenAI key; tell the user how to supply one.
if not os.getenv("OPENAI_API_KEY"):
    for message in (
        "ERROR: OPENAI_API_KEY environment variable is not set.",
        "Please either:",
        "1. Create a .env file with OPENAI_API_KEY=your_key_here",
        "2. Set the environment variable: export OPENAI_API_KEY=your_key_here",
    ):
        print(message)
    exit(1)
def _extract_chunks(pdf_files):
    """Split every page of every PDF into text chunks.

    Returns a pair ``(all_texts, all_metadata)`` where ``all_metadata[i]``
    is a ``{"filename": ..., "page": ...}`` dict describing ``all_texts[i]``.
    Pages are 1-indexed; empty pages are skipped; a failure on one page is
    logged and does not abort the run.
    """
    text_splitter = CharacterTextSplitter()
    all_texts = []
    all_metadata = []
    for file_path in pdf_files:
        print(f"\n--- Processing {file_path} ---")
        loader = PDFLoader(file_path)
        documents = loader.load_documents()
        print(f" Loaded {len(documents)} pages from {os.path.basename(file_path)}")
        if len(documents) > 1:
            # Debug aid: confirm pages are actually distinct content.
            print(f" First page preview: {documents[0][:100]}...")
            print(f" Second page preview: {documents[1][:100]}...")
        for doc_idx, doc in enumerate(documents):
            try:
                page_num = doc_idx + 1  # human-friendly 1-based page number
                print(f" Processing page {page_num} ({len(doc)} chars)")
                if not doc.strip():
                    print(f" Skipping empty page {page_num}")
                    continue
                texts = text_splitter.split_texts([doc])
                print(f" Split into {len(texts)} chunks")
                for chunk_idx, text in enumerate(texts):
                    all_texts.append(text)
                    all_metadata.append({
                        "filename": os.path.basename(file_path),
                        "page": page_num,
                    })
                    # Print a sample of the first chunk per page only.
                    if chunk_idx == 0:
                        print(f" Sample chunk: {text[:50]}... [page: {page_num}]")
            except Exception as e:
                # Best-effort: one bad page must not kill the whole pipeline.
                print(f" Error processing document: {e}")
    return all_texts, all_metadata


def _report_page_distribution(all_metadata):
    """Print how many chunks each page of each file produced."""
    page_counts = {}
    for meta in all_metadata:
        pages = page_counts.setdefault(meta["filename"], {})
        pages[meta["page"]] = pages.get(meta["page"], 0) + 1
    print("\nPage distribution per file:")
    for filename, pages in page_counts.items():
        # BUG FIX: the original printed the literal "(unknown)" here — the
        # f-string had no placeholder — instead of the filename.
        print(f" {filename}:")
        for page, count in sorted(pages.items()):
            print(f" Page {page}: {count} chunks")


def _save_and_verify(all_texts, vectors, all_metadata):
    """Pickle texts/vectors/metadata to disk, then re-load to confirm the
    data round-trips, printing a small metadata sample."""
    print("\nSaving preprocessed data...")
    with open('data/preprocessed_data.pkl', 'wb') as f:
        pickle.dump({
            'texts': all_texts,
            'vectors': vectors,
            'metadata': all_metadata,
        }, f)
    print("Verifying saved data...")
    with open('data/preprocessed_data.pkl', 'rb') as f:
        loaded_data = pickle.load(f)
    print(f"Saved {len(loaded_data['texts'])} texts, {len(loaded_data['vectors'])} vectors, and {len(loaded_data['metadata'])} metadata entries")
    print("\nMetadata sample (first 3 entries):")
    for i in range(min(3, len(loaded_data['metadata']))):
        print(f" {loaded_data['metadata'][i]}")


async def preprocess_files():
    """Embed every PDF under ``data/`` and persist the result.

    Pipeline: discover PDFs, split pages into chunks (with per-chunk
    filename/page metadata), build embeddings via ``VectorDatabase``,
    and save everything to ``data/preprocessed_data.pkl``.
    Returns early (printing a message) when no PDFs are found.
    """
    data_dir = "data"
    # sorted() makes the processing (and chunk) order deterministic —
    # os.listdir ordering is filesystem-dependent.
    pdf_files = sorted(
        os.path.join(data_dir, f)
        for f in os.listdir(data_dir)
        if f.lower().endswith('.pdf')
    )
    if not pdf_files:
        print("No PDF files found in the data directory!")
        return
    print(f"Found {len(pdf_files)} PDF files to process")

    all_texts, all_metadata = _extract_chunks(pdf_files)
    print(f"\nExtracted {len(all_texts)} text chunks from all PDFs")

    _report_page_distribution(all_metadata)

    print("\nCreating vector database with embeddings...")
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(all_texts)
    # Sanity check: the vector store should have one entry per unique text.
    vector_keys = list(vector_db.vectors.keys())
    print(f"Vector DB has {len(vector_keys)} entries")
    print(f"Example key from vector DB: {vector_keys[0][:50]}...")

    _save_and_verify(all_texts, dict(vector_db.vectors), all_metadata)
    print("\nPreprocessing complete. Data saved to data/preprocessed_data.pkl")
# Script entry point: run the async preprocessing pipeline to completion.
if __name__ == "__main__":
    asyncio.run(preprocess_files())