File size: 6,802 Bytes
1307867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
from collections import defaultdict
import tiktoken
import pickle
import shutil
import json
import time
import numpy as np
from pathlib import Path

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

from dotenv import load_dotenv

# Load environment variables from a local .env file (e.g. OPENAI_API_KEY).
load_dotenv()

# Fail fast: the embedding step in process_pdfs() requires an OpenAI API key,
# so abort at import time rather than partway through processing.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY environment variable is not set.")

def tiktoken_len(text):
    """Return the number of tokens in *text* for the gpt-4o-mini tokenizer.

    Used as the ``length_function`` of the text splitter, so it is called once
    per candidate chunk. The original re-resolved the encoder via
    ``tiktoken.encoding_for_model()`` on every call; that lookup (which may
    also fetch vocabulary files on first use) is loop-invariant, so the
    encoder is resolved once and cached on the function object.

    Args:
        text: The string to tokenize.

    Returns:
        int: Number of tokens produced by the encoder.
    """
    enc = getattr(tiktoken_len, "_encoder", None)
    if enc is None:
        enc = tiktoken.encoding_for_model("gpt-4o-mini")
        tiktoken_len._encoder = enc
    return len(enc.encode(text))

def add_page_info_to_splits(splits):
    """Annotate each split with the page number it originated from.

    Each split carries a ``start_index`` (character offset into the merged
    source document) in its metadata. When the metadata also contains
    ``page_ranges`` — a list of ``{"start", "end", "page", ...}`` mappings —
    the first range that overlaps the split's character span determines its
    ``page`` metadata entry. Splits without ``page_ranges`` are left
    untouched.

    Args:
        splits: Documents produced by the text splitter (mutated in place).

    Returns:
        The same list of splits, for chaining.
    """
    for chunk in splits:
        ranges = chunk.metadata.get("page_ranges")
        if not ranges:
            continue

        # Character span this chunk occupies within the merged document.
        chunk_start = chunk.metadata.get("start_index", 0)
        chunk_end = chunk_start + len(chunk.page_content)

        # Assign the first page range whose span overlaps the chunk's span.
        for rng in ranges:
            overlaps = chunk_start <= rng["end"] and chunk_end >= rng["start"]
            if overlaps:
                chunk.metadata["page"] = rng["page"]
                break
    return splits

def clean_directory(directory_path):
    """Reset *directory_path* to an empty directory.

    If the path already exists, its entire tree is deleted first; the
    directory (including any missing parents) is then (re)created.

    Args:
        directory_path: Directory to wipe and recreate (str or Path).
    """
    target = Path(directory_path)

    if target.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(target)

    # Brief pause so the OS can release handles on the removed tree
    # before we recreate it (matters on some platforms, notably Windows).
    time.sleep(1)

    target.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

def _merge_pages_by_source(all_docs):
    """Merge per-page documents into one Document per source PDF.

    Pages from the same PDF are concatenated (separated by a blank line) and
    the character range occupied by each page is recorded in the merged
    document's ``page_ranges`` metadata, so splits can later be mapped back to
    1-indexed page numbers by add_page_info_to_splits().

    Args:
        all_docs: Page-level Documents as returned by PyPDFLoader (each with
            ``source`` and 0-indexed ``page`` metadata).

    Returns:
        list[Document]: One merged Document per source file.
    """
    # Group page documents by their originating PDF file.
    docs_by_source = defaultdict(list)
    for doc in all_docs:
        docs_by_source[doc.metadata.get("source", "")].append(doc)

    merged_docs = []
    for source, source_docs in docs_by_source.items():
        # Restore reading order; page metadata is 0-indexed.
        source_docs.sort(key=lambda x: x.metadata.get("page", 0))

        # Keep just the filename (no path) for cleaner citations downstream.
        filename = os.path.basename(source)

        merged_content = ""
        page_ranges = []
        for doc in source_docs:
            # 1-indexed page number for human readability.
            page_num = doc.metadata.get("page", 0) + 1

            # Blank-line separator between pages for clarity.
            if merged_content:
                merged_content += "\n\n"

            # Record where this page's content lives in the merged text.
            start_pos = len(merged_content)
            merged_content += doc.page_content
            page_ranges.append({
                "start": start_pos,
                "end": len(merged_content),
                "page": page_num,
                "source": filename,
            })

        merged_docs.append(Document(
            page_content=merged_content,
            metadata={
                "source": filename,
                "title": filename,
                "page_count": len(source_docs),
                "merged": True,
                # Character-range -> page map consumed by add_page_info_to_splits().
                "page_ranges": page_ranges,
            },
        ))
    return merged_docs


def _embed_in_batches(split_chunks, embedding_model, batch_size=50):
    """Embed chunks in batches and pair each with its text and metadata.

    Batching keeps individual embedding requests small to avoid API rate
    limits; progress is printed after each batch.

    Args:
        split_chunks: Chunk Documents to embed.
        embedding_model: Object exposing ``embed_documents(list[str])``.
        batch_size: Number of chunks per embedding request.

    Returns:
        list[dict]: One dict per chunk with ``id``, ``text``, ``metadata``
        and ``embedding`` keys; ``id`` is the chunk's global index.
    """
    embedded_docs = []
    total = len(split_chunks)
    for i in range(0, total, batch_size):
        batch = split_chunks[i:i + batch_size]
        texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.embed_documents(texts)

        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j],
            })

        print(f"Embedded {min(i+batch_size, total)}/{total} chunks")
    return embedded_docs


def process_pdfs():
    """Run the full pipeline: load PDFs, merge, split, embed, and persist.

    Steps:
      1. Wipe/recreate ``data/processed_data``.
      2. Load every PDF page from ``notebook_version/data/``.
      3. Merge pages per source file while tracking page ranges.
      4. Split into ~300-token chunks and tag each with its page number.
      5. Embed all chunks with OpenAI ``text-embedding-3-small``.
      6. Pickle both the raw chunks and the embedded records to disk.

    Side effects: filesystem writes under ``data/processed_data`` and
    OpenAI API calls (requires OPENAI_API_KEY).
    """
    print("Processing PDFs...")

    # Start from a clean output directory every run.
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Load all PDF documents (each page as a separate document).
    # NOTE(review): input is read from notebook_version/data/ while output
    # goes to data/processed_data/ — confirm this asymmetry is intentional.
    pdf_path = "notebook_version/data/"
    print(f"Loading PDFs from: {pdf_path}")

    loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader)
    all_docs = loader.load()
    print(f"Loaded {len(all_docs)} document pages.")

    merged_docs = _merge_pages_by_source(all_docs)
    print(f"Created {len(merged_docs)} merged documents.")

    # Split merged documents into token-bounded chunks; add_start_index lets
    # us map each chunk back to its source page afterwards.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
        length_function=tiktoken_len,
        add_start_index=True
    )
    raw_splits = text_splitter.split_documents(merged_docs)
    split_chunks = add_page_info_to_splits(raw_splits)
    print(f"Created {len(split_chunks)} chunks.")

    # Save chunks for later use.
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(split_chunks, f)

    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    print("Embedding document chunks (this may take a while)...")
    embedded_docs = _embed_in_batches(split_chunks, embedding_model)

    # Save the embedded docs for later use.
    with open(processed_data_dir / "embedded_docs.pkl", "wb") as f:
        pickle.dump(embedded_docs, f)

    print("Processing complete. All data saved to data/processed_data/")

# Run the full PDF -> chunks -> embeddings pipeline when executed as a script.
if __name__ == "__main__":
    process_pdfs()