"""Ingest PDFs, merge their pages, split into token-sized chunks, and embed them.

Pipeline: load every PDF page, merge pages per source file while recording
character ranges per page, split the merged text into overlapping chunks,
map each chunk back to its originating page, embed the chunks with a
fine-tuned Arctic model, and pickle both chunks and embeddings to disk.
"""

import json
import os
import pickle
import shutil
import time
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import numpy as np
import tiktoken
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from transformers import AutoModel, AutoTokenizer

# Load environment variables
load_dotenv()


@lru_cache(maxsize=1)
def _gpt4o_mini_encoding():
    """Return (and cache) the tiktoken encoder used to measure chunk length.

    Cached because tiktoken_len is the splitter's length_function and is
    invoked once per candidate chunk; rebuilding the encoder each call is
    pure overhead.
    """
    return tiktoken.encoding_for_model("gpt-4o-mini")


def tiktoken_len(text):
    """Count tokens using the gpt-4o-mini tokenizer."""
    return len(_gpt4o_mini_encoding().encode(text))


def add_page_info_to_splits(splits):
    """Annotate each split with the page number it came from.

    Uses the chunk's character offset (``start_index``, added by the
    splitter) against the ``page_ranges`` metadata recorded when pages were
    merged. The first page range the chunk overlaps wins.

    Args:
        splits: list of langchain ``Document`` chunks (mutated in place).

    Returns:
        The same list, with a ``page`` key added where a match was found.
    """
    for split in splits:
        start_pos = split.metadata.get("start_index", 0)
        end_pos = start_pos + len(split.page_content)

        if "page_ranges" not in split.metadata:
            continue
        for page_range in split.metadata["page_ranges"]:
            # Standard interval-overlap test between the chunk and the page.
            if start_pos <= page_range["end"] and end_pos >= page_range["start"]:
                split.metadata["page"] = page_range["page"]
                break
    return splits


def clean_directory(directory_path):
    """Remove *directory_path* (if present) and recreate it empty.

    Args:
        directory_path: path-like; created (with parents) after cleaning.
    """
    path = Path(directory_path)
    if path.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(path)
        # Wait a moment to ensure OS releases the directory handles
        time.sleep(1)
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")


class ArcticEmbedder:
    """Mean-pooled, L2-normalized sentence embeddings from a HF encoder."""

    def __init__(self, model_name):
        """Load tokenizer + model and move the model to GPU when available."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighting out padding positions."""
        token_embeddings = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # clamp guards against division by zero for all-padding rows.
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

    def encode(self, texts, batch_size=32):
        """Embed *texts* in batches.

        Args:
            texts: sequence of strings.
            batch_size: number of texts tokenized/forwarded per step.

        Returns:
            np.ndarray of shape (len(texts), dim) with unit-norm rows.
        """
        if not texts:
            # np.concatenate raises on an empty list; return an empty
            # (0, dim) array so callers can handle zero inputs uniformly.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            encoded_input = self.tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            ).to(self.device)
            with torch.no_grad():
                model_output = self.model(**encoded_input)
            batch_embeddings = self._mean_pooling(
                model_output, encoded_input["attention_mask"]
            )
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
            all_embeddings.append(batch_embeddings.cpu().numpy())
        return np.concatenate(all_embeddings)


def _merge_pdf_pages(all_docs):
    """Merge per-page documents into one Document per source PDF.

    Records, in ``page_ranges`` metadata, the character span each page
    occupies inside the merged text so chunks can later be mapped back to
    their original (1-indexed) page number.
    """
    docs_by_source = defaultdict(list)
    for doc in all_docs:
        docs_by_source[doc.metadata.get("source", "")].append(doc)

    merged_docs = []
    for source, source_docs in docs_by_source.items():
        # Sort by page number if available
        source_docs.sort(key=lambda x: x.metadata.get("page", 0))
        filename = os.path.basename(source)

        merged_content = ""
        page_ranges = []
        for doc in source_docs:
            # Page numbers are 1-indexed for human readability.
            page_num = doc.metadata.get("page", 0) + 1
            if merged_content:
                # Separator between pages for clarity.
                merged_content += "\n\n"
            start_pos = len(merged_content)
            merged_content += doc.page_content
            page_ranges.append({
                "start": start_pos,
                "end": len(merged_content),
                "page": page_num,
                "source": filename,
            })

        merged_docs.append(Document(
            page_content=merged_content,
            metadata={
                "source": filename,
                "title": filename,
                "page_count": len(source_docs),
                "merged": True,
                "page_ranges": page_ranges,  # for chunk→page mapping later
            },
        ))
    return merged_docs


def _split_into_chunks(merged_docs):
    """Split merged documents into overlapping token-sized chunks with page info."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=150,
        chunk_overlap=100,
        length_function=tiktoken_len,
        add_start_index=True,  # needed by add_page_info_to_splits
    )
    raw_splits = text_splitter.split_documents(merged_docs)
    return add_page_info_to_splits(raw_splits)


def _embed_chunks(embedding_model, split_chunks, batch_size=50):
    """Embed every chunk and pair each with its text, metadata and an id."""
    embedded_docs = []
    # Embed in batches to avoid API rate limits
    for i in range(0, len(split_chunks), batch_size):
        batch = split_chunks[i:i + batch_size]
        texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.encode(texts)
        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j],
            })
        print(f"Embedded {min(i + batch_size, len(split_chunks))}/{len(split_chunks)} chunks")
    return embedded_docs


def process_pdfs():
    """Process PDFs and create vectorstore.

    Side effects: wipes and recreates ``data/processed_data``, then writes
    ``chunks.pkl`` (split chunks) and ``embedded_docs.pkl`` (chunks with
    embeddings) into it.
    """
    print("Processing PDFs...")

    # Start from a clean output directory every run.
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Load all PDF documents (each page arrives as a separate document).
    pdf_path = "notebook_version_clean/data/"
    print(f"Loading PDFs from: {pdf_path}")
    loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader)
    all_docs = loader.load()
    print(f"Loaded {len(all_docs)} document pages.")

    merged_docs = _merge_pdf_pages(all_docs)
    print(f"Created {len(merged_docs)} merged documents.")

    split_chunks = _split_into_chunks(merged_docs)
    print(f"Created {len(split_chunks)} chunks.")

    # Save chunks for later use
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(split_chunks, f)

    # Initialize custom embedding model
    try:
        embedding_model = ArcticEmbedder(
            "kamkol/ab_testing_finetuned_arctic_ft-36dfff22-0696-40d2-b3bf-268fe2ff2aec"
        )
        print("Successfully loaded ArcticEmbedder model")
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise RuntimeError(f"Error initializing ArcticEmbedder model: {str(e)}")

    print("Embedding document chunks (this may take a while)...")
    embedded_docs = _embed_chunks(embedding_model, split_chunks)

    # Save the embedded docs for later use
    with open(processed_data_dir / "embedded_docs.pkl", "wb") as f:
        pickle.dump(embedded_docs, f)

    print("Processing complete. All data saved to data/processed_data/")


if __name__ == "__main__":
    process_pdfs()