Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| import numpy as np | |
| from aimakerspace.text_utils import CharacterTextSplitter, PDFLoader | |
| from aimakerspace.openai_utils.embedding import EmbeddingModel | |
| from aimakerspace.vectordatabase import VectorDatabase | |
| import asyncio | |
| import dotenv | |
| import glob | |
# Load environment variables from a .env file if one exists (no-op otherwise).
dotenv.load_dotenv()

# Fail fast with guidance if the OpenAI key is missing: every downstream step
# (embedding the chunks) requires it, so there is no point continuing.
if not os.environ.get("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY environment variable is not set.")
    print("Please either:")
    print("1. Create a .env file with OPENAI_API_KEY=your_key_here")
    print("2. Set the environment variable: export OPENAI_API_KEY=your_key_here")
    # SystemExit instead of the interactive-only `exit()` helper, which is
    # provided by the `site` module and not guaranteed in all run modes.
    raise SystemExit(1)
async def preprocess_files():
    """Build and persist a vector database from every PDF in the data directory.

    Loads each PDF, splits each page into text chunks, embeds the chunks into
    a VectorDatabase, and pickles the texts, vectors, and per-chunk metadata
    ({"filename", "page"}) to <data_dir>/preprocessed_data.pkl.

    Returns:
        None. Side effects: console progress output and the pickle file.
    """
    # Collect every PDF in the data directory (case-insensitive extension).
    data_dir = "data"
    pdf_files = [
        os.path.join(data_dir, f)
        for f in os.listdir(data_dir)
        if f.lower().endswith('.pdf')
    ]
    if not pdf_files:
        print("No PDF files found in the data directory!")
        return

    print(f"Found {len(pdf_files)} PDF files to process")

    text_splitter = CharacterTextSplitter()
    all_texts = []      # flat list of all chunks across all PDFs
    all_metadata = []   # parallel list: {"filename", "page"} per chunk

    # Load and process all PDF documents.
    for file_path in pdf_files:
        print(f"\n--- Processing {file_path} ---")
        loader = PDFLoader(file_path)
        documents = loader.load_documents()
        print(f" Loaded {len(documents)} pages from {os.path.basename(file_path)}")

        # Debug: check a few pages to ensure they're different.
        if len(documents) > 1:
            print(f" First page preview: {documents[0][:100]}...")
            print(f" Second page preview: {documents[1][:100]}...")

        # Split each page into chunks, keeping filename/page metadata aligned
        # with the chunk list by index.
        for doc_idx, doc in enumerate(documents):
            try:
                page_num = doc_idx + 1  # 1-based page number from document order
                print(f" Processing page {page_num} ({len(doc)} chars)")

                # Skip empty pages — they would produce empty chunks.
                if not doc.strip():
                    print(f" Skipping empty page {page_num}")
                    continue

                texts = text_splitter.split_texts([doc])
                print(f" Split into {len(texts)} chunks")
                for chunk_idx, text in enumerate(texts):
                    all_texts.append(text)
                    all_metadata.append({
                        "filename": os.path.basename(file_path),
                        "page": page_num,
                    })
                    # Print sample of first chunk per page.
                    if chunk_idx == 0:
                        print(f" Sample chunk: {text[:50]}... [page: {page_num}]")
            except Exception as e:
                # Best-effort per page: one bad page must not abort the run.
                print(f" Error processing document: {e}")

    print(f"\nExtracted {len(all_texts)} text chunks from all PDFs")
    # Nothing to embed (e.g. every page was empty) — avoid building and
    # saving an empty database.
    if not all_texts:
        print("No text chunks extracted; nothing to embed.")
        return

    # Verify page distribution: chunks per page, per file.
    page_counts = {}
    for meta in all_metadata:
        per_file = page_counts.setdefault(meta["filename"], {})
        page = meta["page"]
        per_file[page] = per_file.get(page, 0) + 1

    print("\nPage distribution per file:")
    for filename, pages in page_counts.items():
        # BUG FIX: previously printed the literal "(unknown)" instead of the
        # filename held by the loop variable.
        print(f" {filename}:")
        for page, count in sorted(pages.items()):
            print(f" Page {page}: {count} chunks")

    print("\nCreating vector database with embeddings...")
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(all_texts)

    # Verify that vectors match texts; guard the preview against an empty DB.
    vector_keys = list(vector_db.vectors.keys())
    print(f"Vector DB has {len(vector_keys)} entries")
    if vector_keys:
        print(f"Example key from vector DB: {vector_keys[0][:50]}...")

    # Save the processed data with metadata, next to the source PDFs.
    output_path = os.path.join(data_dir, 'preprocessed_data.pkl')
    print("\nSaving preprocessed data...")
    with open(output_path, 'wb') as f:
        data_to_save = {
            'texts': all_texts,
            'vectors': dict(vector_db.vectors),
            'metadata': all_metadata,
        }
        pickle.dump(data_to_save, f)

    # Verify data was saved correctly by reloading it.
    print("Verifying saved data...")
    with open(output_path, 'rb') as f:
        loaded_data = pickle.load(f)
    print(f"Saved {len(loaded_data['texts'])} texts, {len(loaded_data['vectors'])} vectors, and {len(loaded_data['metadata'])} metadata entries")

    # Check a few metadata entries to confirm page numbers.
    print("\nMetadata sample (first 3 entries):")
    for i in range(min(3, len(loaded_data['metadata']))):
        print(f" {loaded_data['metadata'][i]}")

    print("\nPreprocessing complete. Data saved to data/preprocessed_data.pkl")
| if __name__ == "__main__": | |
| asyncio.run(preprocess_files()) |