import os
import asyncio
import glob
import pickle
import sys

import numpy as np

import dotenv

from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.text_utils import CharacterTextSplitter, PDFLoader
from aimakerspace.vectordatabase import VectorDatabase
# Load .env file if it exists, so OPENAI_API_KEY can be supplied from a file
# instead of the shell environment.
dotenv.load_dotenv()

# Fail fast with actionable guidance if the key is missing — the embedding
# step later in this script cannot work without it.
if not os.environ.get("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY environment variable is not set.")
    print("Please either:")
    print("1. Create a .env file with OPENAI_API_KEY=your_key_here")
    print("2. Set the environment variable: export OPENAI_API_KEY=your_key_here")
    # sys.exit() instead of exit(): the bare exit() builtin is injected by the
    # `site` module for interactive use and is not guaranteed in all runtimes.
    sys.exit(1)
async def preprocess_files():
    """Load every PDF under ./data, chunk the text, embed it, and pickle the result.

    Writes ``data/preprocessed_data.pkl`` containing:
      - ``texts``: list[str] — every text chunk extracted from all PDFs
      - ``vectors``: dict mapping chunk text -> embedding vector
      - ``metadata``: list of ``{"filename", "page"}`` dicts, index-aligned
        with ``texts``

    Returns early (printing a message) if the data directory holds no PDFs.
    """
    # Get all PDF files from the data directory
    data_dir = "data"
    pdf_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir)
                 if f.lower().endswith('.pdf')]

    if not pdf_files:
        print("No PDF files found in the data directory!")
        return

    print(f"Found {len(pdf_files)} PDF files to process")

    text_splitter = CharacterTextSplitter()
    all_texts = []
    all_metadata = []

    # Load each PDF and split every non-empty page into chunks.
    for file_path in pdf_files:
        print(f"\n--- Processing {file_path} ---")
        loader = PDFLoader(file_path)
        documents = loader.load_documents()
        print(f" Loaded {len(documents)} pages from {os.path.basename(file_path)}")

        # Debug: peek at two pages to confirm the loader returns distinct text.
        if len(documents) > 1:
            print(f" First page preview: {documents[0][:100]}...")
            print(f" Second page preview: {documents[1][:100]}...")

        for doc_idx, doc in enumerate(documents):
            try:
                # Use document index + 1 as a 1-based page number; the loader
                # does not expose page metadata directly.
                page_num = doc_idx + 1
                print(f" Processing page {page_num} ({len(doc)} chars)")

                # Skip pages with no extractable text (e.g. scanned images).
                if not doc.strip():
                    print(f" Skipping empty page {page_num}")
                    continue

                texts = text_splitter.split_texts([doc])
                print(f" Split into {len(texts)} chunks")

                for chunk_idx, text in enumerate(texts):
                    all_texts.append(text)
                    # Metadata stays index-aligned with all_texts so provenance
                    # (source file + page) can be recovered per chunk.
                    all_metadata.append({
                        "filename": os.path.basename(file_path),
                        "page": page_num
                    })
                    if chunk_idx == 0:
                        print(f" Sample chunk: {text[:50]}... [page: {page_num}]")
            except Exception as e:
                # Best-effort per page: report and keep processing the rest.
                print(f" Error processing document: {e}")

    print(f"\nExtracted {len(all_texts)} text chunks from all PDFs")

    # Tally chunks per (file, page) to sanity-check the extraction.
    page_counts = {}
    for meta in all_metadata:
        per_file = page_counts.setdefault(meta["filename"], {})
        per_file[meta["page"]] = per_file.get(meta["page"], 0) + 1

    print("\nPage distribution per file:")
    for filename, pages in page_counts.items():
        # BUG FIX: previously printed the literal text "(unknown)" instead of
        # interpolating the filename, so the report never named the file.
        print(f"  {filename}:")
        for page, count in sorted(pages.items()):
            print(f"    Page {page}: {count} chunks")

    print("\nCreating vector database with embeddings...")
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(all_texts)

    # Verify that vectors match texts.
    # NOTE(review): vectors appear to be keyed by chunk text, so byte-identical
    # duplicate chunks would collapse to one entry — confirm against
    # VectorDatabase if exact text/vector counts must match.
    vector_keys = list(vector_db.vectors.keys())
    print(f"Vector DB has {len(vector_keys)} entries")
    print(f"Example key from vector DB: {vector_keys[0][:50]}...")

    # Save the processed data with metadata.
    print("\nSaving preprocessed data...")
    with open('data/preprocessed_data.pkl', 'wb') as f:
        data_to_save = {
            'texts': all_texts,
            'vectors': dict(vector_db.vectors),
            'metadata': all_metadata
        }
        pickle.dump(data_to_save, f)

    # Round-trip the pickle to confirm it was written correctly.
    print("Verifying saved data...")
    with open('data/preprocessed_data.pkl', 'rb') as f:
        loaded_data = pickle.load(f)
        print(f"Saved {len(loaded_data['texts'])} texts, {len(loaded_data['vectors'])} vectors, and {len(loaded_data['metadata'])} metadata entries")

        # Check a few metadata entries to confirm page numbers survived.
        print("\nMetadata sample (first 3 entries):")
        for i in range(min(3, len(loaded_data['metadata']))):
            print(f" {loaded_data['metadata'][i]}")

    print("\nPreprocessing complete. Data saved to data/preprocessed_data.pkl")
# Script entry point: run the async preprocessing pipeline to completion.
if __name__ == "__main__":
    asyncio.run(preprocess_files())