# AB_Testing_RAG / preprocess.py
# Commit 6b569cb (kamkol): Fix PDFLoader to process pages individually
# for correct page numbering
import os
import pickle
import numpy as np
from aimakerspace.text_utils import CharacterTextSplitter, PDFLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.vectordatabase import VectorDatabase
import asyncio
import dotenv
import glob
# Pull environment variables from a local .env file when one is present.
dotenv.load_dotenv()

# Refuse to run without an OpenAI key; tell the user how to supply one.
if not os.getenv("OPENAI_API_KEY"):
    for message in (
        "ERROR: OPENAI_API_KEY environment variable is not set.",
        "Please either:",
        "1. Create a .env file with OPENAI_API_KEY=your_key_here",
        "2. Set the environment variable: export OPENAI_API_KEY=your_key_here",
    ):
        print(message)
    exit(1)
def _extract_chunks(pdf_files):
    """Split every page of every PDF into text chunks.

    Returns a pair ``(all_texts, all_metadata)`` where ``all_metadata[i]``
    is a ``{"filename": ..., "page": ...}`` dict describing ``all_texts[i]``.
    Pages are 1-indexed; empty pages are skipped; a failure on one page is
    logged and does not abort the run.
    """
    text_splitter = CharacterTextSplitter()
    all_texts = []
    all_metadata = []
    for file_path in pdf_files:
        print(f"\n--- Processing {file_path} ---")
        loader = PDFLoader(file_path)
        documents = loader.load_documents()
        print(f" Loaded {len(documents)} pages from {os.path.basename(file_path)}")
        if len(documents) > 1:
            # Debug aid: confirm pages are actually distinct content.
            print(f" First page preview: {documents[0][:100]}...")
            print(f" Second page preview: {documents[1][:100]}...")
        for doc_idx, doc in enumerate(documents):
            try:
                page_num = doc_idx + 1  # human-friendly 1-based page number
                print(f" Processing page {page_num} ({len(doc)} chars)")
                if not doc.strip():
                    print(f" Skipping empty page {page_num}")
                    continue
                texts = text_splitter.split_texts([doc])
                print(f" Split into {len(texts)} chunks")
                for chunk_idx, text in enumerate(texts):
                    all_texts.append(text)
                    all_metadata.append({
                        "filename": os.path.basename(file_path),
                        "page": page_num,
                    })
                    # Print a sample of the first chunk per page only.
                    if chunk_idx == 0:
                        print(f" Sample chunk: {text[:50]}... [page: {page_num}]")
            except Exception as e:
                # Best-effort: one bad page must not kill the whole pipeline.
                print(f" Error processing document: {e}")
    return all_texts, all_metadata


def _report_page_distribution(all_metadata):
    """Print how many chunks each page of each file produced."""
    page_counts = {}
    for meta in all_metadata:
        pages = page_counts.setdefault(meta["filename"], {})
        pages[meta["page"]] = pages.get(meta["page"], 0) + 1
    print("\nPage distribution per file:")
    for filename, pages in page_counts.items():
        # BUG FIX: the original printed the literal "(unknown)" here — the
        # f-string had no placeholder — instead of the filename.
        print(f" {filename}:")
        for page, count in sorted(pages.items()):
            print(f" Page {page}: {count} chunks")


def _save_and_verify(all_texts, vectors, all_metadata):
    """Pickle texts/vectors/metadata to disk, then re-load to confirm the
    data round-trips, printing a small metadata sample."""
    print("\nSaving preprocessed data...")
    with open('data/preprocessed_data.pkl', 'wb') as f:
        pickle.dump({
            'texts': all_texts,
            'vectors': vectors,
            'metadata': all_metadata,
        }, f)
    print("Verifying saved data...")
    with open('data/preprocessed_data.pkl', 'rb') as f:
        loaded_data = pickle.load(f)
    print(f"Saved {len(loaded_data['texts'])} texts, {len(loaded_data['vectors'])} vectors, and {len(loaded_data['metadata'])} metadata entries")
    print("\nMetadata sample (first 3 entries):")
    for i in range(min(3, len(loaded_data['metadata']))):
        print(f" {loaded_data['metadata'][i]}")


async def preprocess_files():
    """Embed every PDF under ``data/`` and persist the result.

    Pipeline: discover PDFs, split pages into chunks (with per-chunk
    filename/page metadata), build embeddings via ``VectorDatabase``,
    and save everything to ``data/preprocessed_data.pkl``.
    Returns early (printing a message) when no PDFs are found.
    """
    data_dir = "data"
    # sorted() makes the processing (and chunk) order deterministic —
    # os.listdir ordering is filesystem-dependent.
    pdf_files = sorted(
        os.path.join(data_dir, f)
        for f in os.listdir(data_dir)
        if f.lower().endswith('.pdf')
    )
    if not pdf_files:
        print("No PDF files found in the data directory!")
        return
    print(f"Found {len(pdf_files)} PDF files to process")

    all_texts, all_metadata = _extract_chunks(pdf_files)
    print(f"\nExtracted {len(all_texts)} text chunks from all PDFs")

    _report_page_distribution(all_metadata)

    print("\nCreating vector database with embeddings...")
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(all_texts)
    # Sanity check: the vector store should have one entry per unique text.
    vector_keys = list(vector_db.vectors.keys())
    print(f"Vector DB has {len(vector_keys)} entries")
    print(f"Example key from vector DB: {vector_keys[0][:50]}...")

    _save_and_verify(all_texts, dict(vector_db.vectors), all_metadata)
    print("\nPreprocessing complete. Data saved to data/preprocessed_data.pkl")
# Script entry point: run the async preprocessing pipeline to completion.
if __name__ == "__main__":
    asyncio.run(preprocess_files())