File size: 5,112 Bytes
b06d945
 
 
 
 
 
 
6b569cb
 
 
 
 
 
 
 
 
 
 
 
 
b06d945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b569cb
b06d945
 
 
6b569cb
 
 
 
 
 
 
b06d945
 
 
 
6b569cb
 
 
 
 
 
 
 
b06d945
6b569cb
b06d945
6b569cb
b06d945
 
 
 
 
 
6b569cb
 
 
 
b06d945
6b569cb
 
 
b06d945
6b569cb
 
 
 
 
 
 
 
 
 
 
 
b06d945
6b569cb
 
 
 
 
 
 
b06d945
 
 
 
6b569cb
 
 
 
 
b06d945
6b569cb
b06d945
6b569cb
b06d945
 
 
6b569cb
 
 
 
 
 
 
 
 
 
 
 
 
 
b06d945
6b569cb
b06d945
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import pickle
import numpy as np
from aimakerspace.text_utils import CharacterTextSplitter, PDFLoader
from aimakerspace.openai_utils.embedding import EmbeddingModel
from aimakerspace.vectordatabase import VectorDatabase
import asyncio
import dotenv
import glob

# Load environment variables from a .env file if one exists.
dotenv.load_dotenv()

# Fail fast with actionable guidance when the OpenAI key is missing:
# every embedding call made later in this script requires it.
if not os.environ.get("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY environment variable is not set.")
    print("Please either:")
    print("1. Create a .env file with OPENAI_API_KEY=your_key_here")
    print("2. Set the environment variable: export OPENAI_API_KEY=your_key_here")
    # `exit()` is a site-module convenience not guaranteed in all run modes
    # (e.g. `python -S`, frozen apps); SystemExit is always available and
    # produces the identical exit status.
    raise SystemExit(1)

async def preprocess_files():
    """Build and persist a vector database from every PDF in ``data/``.

    Pipeline:
      1. Discover ``*.pdf`` files under ``data/`` (case-insensitive suffix).
      2. Load each PDF page, split it into text chunks, and record
         per-chunk metadata (source filename + 1-based page number).
      3. Embed all chunks into a ``VectorDatabase``.
      4. Pickle ``{'texts', 'vectors', 'metadata'}`` to
         ``data/preprocessed_data.pkl`` and re-load it to verify.

    Returns ``None``; all progress is reported via ``print``.
    """
    # Get all PDF files from the data directory
    data_dir = "data"
    pdf_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir)
                 if f.lower().endswith('.pdf')]

    if not pdf_files:
        print("No PDF files found in the data directory!")
        return

    print(f"Found {len(pdf_files)} PDF files to process")

    text_splitter = CharacterTextSplitter()
    all_texts = []      # flat list of every chunk, in extraction order
    all_metadata = []   # parallel list: one {filename, page} dict per chunk

    # Load and process all PDF documents
    for file_path in pdf_files:
        print(f"\n--- Processing {file_path} ---")
        loader = PDFLoader(file_path)
        documents = loader.load_documents()

        print(f"  Loaded {len(documents)} pages from {os.path.basename(file_path)}")

        # Debug: check a few pages to ensure they're different
        if len(documents) > 1:
            print(f"  First page preview: {documents[0][:100]}...")
            print(f"  Second page preview: {documents[1][:100]}...")

        # Get pages from each document and extract text chunks
        for doc_idx, doc in enumerate(documents):
            try:
                # PDFLoader yields one string per page, so the page number
                # is just the 1-based position in the list.
                page_num = doc_idx + 1
                print(f"  Processing page {page_num} ({len(doc)} chars)")

                # Skip empty pages
                if not doc.strip():
                    print(f"  Skipping empty page {page_num}")
                    continue

                texts = text_splitter.split_texts([doc])
                print(f"  Split into {len(texts)} chunks")

                for chunk_idx, text in enumerate(texts):
                    all_texts.append(text)
                    # Metadata is kept index-aligned with all_texts so a
                    # chunk's source can be looked up by position later.
                    all_metadata.append({
                        "filename": os.path.basename(file_path),
                        "page": page_num
                    })

                    # Print sample of first chunk per page
                    if chunk_idx == 0:
                        print(f"  Sample chunk: {text[:50]}... [page: {page_num}]")
            except Exception as e:
                # Best-effort: a single malformed page should not abort the
                # whole preprocessing run.
                print(f"  Error processing document: {e}")

    print(f"\nExtracted {len(all_texts)} text chunks from all PDFs")

    # Verify page distribution: count chunks per (filename, page).
    page_counts = {}
    for meta in all_metadata:
        filename = meta["filename"]
        page = meta["page"]
        if filename not in page_counts:
            page_counts[filename] = {}

        if page not in page_counts[filename]:
            page_counts[filename][page] = 0

        page_counts[filename][page] += 1

    print("\nPage distribution per file:")
    for filename, pages in page_counts.items():
        # BUGFIX: previously printed the literal "(unknown)" instead of the
        # filename the loop variable holds.
        print(f"  {filename}:")
        for page, count in sorted(pages.items()):
            print(f"    Page {page}: {count} chunks")

    print("\nCreating vector database with embeddings...")
    # Create vector database with embeddings
    vector_db = VectorDatabase()
    vector_db = await vector_db.abuild_from_list(all_texts)

    # Verify that vectors match texts. NOTE(review): vectors appear to be
    # keyed by chunk text, so duplicate chunks may collapse — entry count
    # can legitimately be below len(all_texts).
    vector_keys = list(vector_db.vectors.keys())
    print(f"Vector DB has {len(vector_keys)} entries")
    if vector_keys:  # guard: avoid IndexError when no chunks were embedded
        print(f"Example key from vector DB: {vector_keys[0][:50]}...")

    # Save the processed data with metadata
    print("\nSaving preprocessed data...")
    with open('data/preprocessed_data.pkl', 'wb') as f:
        data_to_save = {
            'texts': all_texts,
            'vectors': dict(vector_db.vectors),
            'metadata': all_metadata
        }
        pickle.dump(data_to_save, f)

    # Verify data was saved correctly by round-tripping the pickle.
    print("Verifying saved data...")
    with open('data/preprocessed_data.pkl', 'rb') as f:
        loaded_data = pickle.load(f)

    print(f"Saved {len(loaded_data['texts'])} texts, {len(loaded_data['vectors'])} vectors, and {len(loaded_data['metadata'])} metadata entries")

    # Check a few metadata entries to confirm page numbers
    print("\nMetadata sample (first 3 entries):")
    for i in range(min(3, len(loaded_data['metadata']))):
        print(f"  {loaded_data['metadata'][i]}")

    print("\nPreprocessing complete. Data saved to data/preprocessed_data.pkl")

# Script entry point: drive the async preprocessing pipeline to completion
# with a fresh event loop.
if __name__ == "__main__":
    asyncio.run(preprocess_files())