import os
import pickle
import asyncio

import numpy as np
import dotenv

from aimakerspace.text_utils import CharacterTextSplitter, PDFLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.vectordatabase import VectorDatabase

# Load .env file if it exists
dotenv.load_dotenv()

# Check if OPENAI_API_KEY is available
if not os.environ.get("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY environment variable is not set.")
    print("Please either:")
    print("1. Create a .env file with OPENAI_API_KEY=your_key_here")
    print("2. Set the environment variable: export OPENAI_API_KEY=your_key_here")
    exit(1)


async def preprocess_files():
    # Get all PDF files from the data directory
    data_dir = "data"
    pdf_files = [
        os.path.join(data_dir, f)
        for f in os.listdir(data_dir)
        if f.lower().endswith('.pdf')
    ]

    if not pdf_files:
        print("No PDF files found in the data directory!")
        return

    print(f"Found {len(pdf_files)} PDF files to process")

    text_splitter = CharacterTextSplitter()
    all_texts = []
    all_metadata = []

    # Load and process all PDF documents
    for file_path in pdf_files:
        print(f"\n--- Processing {file_path} ---")
        loader = PDFLoader(file_path)
        documents = loader.load_documents()
        print(f"  Loaded {len(documents)} pages from {os.path.basename(file_path)}")

        # Debug: check a few pages to ensure they're different
        if len(documents) > 1:
            print(f"  First page preview: {documents[0][:100]}...")
            print(f"  Second page preview: {documents[1][:100]}...")

        # Get pages from each document and extract text chunks
        for doc_idx, doc in enumerate(documents):
            try:
                page_num = doc_idx + 1  # Use document index + 1 as page number
                print(f"  Processing page {page_num} ({len(doc)} chars)")

                # Skip empty pages
                if not doc.strip():
                    print(f"    Skipping empty page {page_num}")
                    continue

                texts = text_splitter.split_texts([doc])
                print(f"    Split into {len(texts)} chunks")

                for chunk_idx, text in enumerate(texts):
                    all_texts.append(text)
                    # Store metadata with each chunk
                    all_metadata.append({
                        "filename": os.path.basename(file_path),
                        "page": page_num,
                    })

                    # Print sample of first chunk per page
                    if chunk_idx == 0:
                        print(f"      Sample chunk: {text[:50]}... [page: {page_num}]")
            except Exception as e:
                print(f"    Error processing document: {e}")

    print(f"\nExtracted {len(all_texts)} text chunks from all PDFs")

    # Verify page distribution
    page_counts = {}
    for meta in all_metadata:
        filename = meta["filename"]
        page = meta["page"]
        if filename not in page_counts:
            page_counts[filename] = {}
        if page not in page_counts[filename]:
            page_counts[filename][page] = 0
        page_counts[filename][page] += 1

    print("\nPage distribution per file:")
    for filename, pages in page_counts.items():
        print(f"  {filename}:")
        for page, count in sorted(pages.items()):
            print(f"    Page {page}: {count} chunks")

    print("\nCreating vector database with embeddings...")
    # Create vector database with embeddings
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(all_texts)

    # Verify that vectors match texts
    vector_keys = list(vector_db.vectors.keys())
    print(f"Vector DB has {len(vector_keys)} entries")
    print(f"Example key from vector DB: {vector_keys[0][:50]}...")

    # Save the processed data with metadata
    print("\nSaving preprocessed data...")
    with open('data/preprocessed_data.pkl', 'wb') as f:
        data_to_save = {
            'texts': all_texts,
            'vectors': dict(vector_db.vectors),
            'metadata': all_metadata,
        }
        pickle.dump(data_to_save, f)

    # Verify data was saved correctly
    print("Verifying saved data...")
    with open('data/preprocessed_data.pkl', 'rb') as f:
        loaded_data = pickle.load(f)
    print(f"Saved {len(loaded_data['texts'])} texts, {len(loaded_data['vectors'])} vectors, and {len(loaded_data['metadata'])} metadata entries")

    # Check a few metadata entries to confirm page numbers
    print("\nMetadata sample (first 3 entries):")
    for i in range(min(3, len(loaded_data['metadata']))):
        print(f"  {loaded_data['metadata'][i]}")

    print("\nPreprocessing complete. Data saved to data/preprocessed_data.pkl")


if __name__ == "__main__":
    asyncio.run(preprocess_files())