File size: 6,802 Bytes
1307867
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
from collections import defaultdict
import tiktoken
import pickle
import shutil
import json
import time
import numpy as np
from pathlib import Path

from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

from dotenv import load_dotenv

# Load environment variables from a local .env file (e.g. OPENAI_API_KEY).
load_dotenv()

# Fail fast: the embedding step in process_pdfs() requires an OpenAI API key,
# so abort at import time rather than partway through processing.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY environment variable is not set.")

def tiktoken_len(text):
    """Return the number of tokens in *text* for the gpt-4o-mini tokenizer.

    Used as the ``length_function`` of the text splitter, so it is called once
    per candidate chunk. The original re-resolved the encoder via
    ``tiktoken.encoding_for_model()`` on every call; that lookup (which may
    also fetch vocabulary files on first use) is loop-invariant, so the
    encoder is resolved once and cached on the function object.

    Args:
        text: The string to tokenize.

    Returns:
        int: Number of tokens produced by the encoder.
    """
    enc = getattr(tiktoken_len, "_encoder", None)
    if enc is None:
        enc = tiktoken.encoding_for_model("gpt-4o-mini")
        tiktoken_len._encoder = enc
    return len(enc.encode(text))

def add_page_info_to_splits(splits):
    """Annotate each split with the page number it originated from.

    Each split carries a ``start_index`` (character offset into the merged
    source document) in its metadata. When the metadata also contains
    ``page_ranges`` — a list of ``{"start", "end", "page", ...}`` mappings —
    the first range that overlaps the split's character span determines its
    ``page`` metadata entry. Splits without ``page_ranges`` are left
    untouched.

    Args:
        splits: Documents produced by the text splitter (mutated in place).

    Returns:
        The same list of splits, for chaining.
    """
    for chunk in splits:
        ranges = chunk.metadata.get("page_ranges")
        if not ranges:
            continue

        # Character span this chunk occupies within the merged document.
        chunk_start = chunk.metadata.get("start_index", 0)
        chunk_end = chunk_start + len(chunk.page_content)

        # Assign the first page range whose span overlaps the chunk's span.
        for rng in ranges:
            overlaps = chunk_start <= rng["end"] and chunk_end >= rng["start"]
            if overlaps:
                chunk.metadata["page"] = rng["page"]
                break
    return splits

def clean_directory(directory_path):
    """Reset *directory_path* to an empty directory.

    If the path already exists, its entire tree is deleted first; the
    directory (including any missing parents) is then (re)created.

    Args:
        directory_path: Directory to wipe and recreate (str or Path).
    """
    target = Path(directory_path)

    if target.exists():
        print(f"Cleaning directory: {directory_path}")
        shutil.rmtree(target)

    # Brief pause so the OS can release handles on the removed tree
    # before we recreate it (matters on some platforms, notably Windows).
    time.sleep(1)

    target.mkdir(parents=True, exist_ok=True)
    print(f"Created clean directory: {directory_path}")

def _merge_pages_by_source(all_docs):
    """Merge per-page documents into one Document per source PDF.

    Pages from the same PDF are concatenated (separated by a blank line) and
    the character range occupied by each page is recorded in the merged
    document's ``page_ranges`` metadata, so splits can later be mapped back to
    1-indexed page numbers by add_page_info_to_splits().

    Args:
        all_docs: Page-level Documents as returned by PyPDFLoader (each with
            ``source`` and 0-indexed ``page`` metadata).

    Returns:
        list[Document]: One merged Document per source file.
    """
    # Group page documents by their originating PDF file.
    docs_by_source = defaultdict(list)
    for doc in all_docs:
        docs_by_source[doc.metadata.get("source", "")].append(doc)

    merged_docs = []
    for source, source_docs in docs_by_source.items():
        # Restore reading order; page metadata is 0-indexed.
        source_docs.sort(key=lambda x: x.metadata.get("page", 0))

        # Keep just the filename (no path) for cleaner citations downstream.
        filename = os.path.basename(source)

        merged_content = ""
        page_ranges = []
        for doc in source_docs:
            # 1-indexed page number for human readability.
            page_num = doc.metadata.get("page", 0) + 1

            # Blank-line separator between pages for clarity.
            if merged_content:
                merged_content += "\n\n"

            # Record where this page's content lives in the merged text.
            start_pos = len(merged_content)
            merged_content += doc.page_content
            page_ranges.append({
                "start": start_pos,
                "end": len(merged_content),
                "page": page_num,
                "source": filename,
            })

        merged_docs.append(Document(
            page_content=merged_content,
            metadata={
                "source": filename,
                "title": filename,
                "page_count": len(source_docs),
                "merged": True,
                # Character-range -> page map consumed by add_page_info_to_splits().
                "page_ranges": page_ranges,
            },
        ))
    return merged_docs


def _embed_in_batches(split_chunks, embedding_model, batch_size=50):
    """Embed chunks in batches and pair each with its text and metadata.

    Batching keeps individual embedding requests small to avoid API rate
    limits; progress is printed after each batch.

    Args:
        split_chunks: Chunk Documents to embed.
        embedding_model: Object exposing ``embed_documents(list[str])``.
        batch_size: Number of chunks per embedding request.

    Returns:
        list[dict]: One dict per chunk with ``id``, ``text``, ``metadata``
        and ``embedding`` keys; ``id`` is the chunk's global index.
    """
    embedded_docs = []
    total = len(split_chunks)
    for i in range(0, total, batch_size):
        batch = split_chunks[i:i + batch_size]
        texts = [doc.page_content for doc in batch]
        embeddings = embedding_model.embed_documents(texts)

        for j, doc in enumerate(batch):
            embedded_docs.append({
                "id": i + j,
                "text": doc.page_content,
                "metadata": doc.metadata,
                "embedding": embeddings[j],
            })

        print(f"Embedded {min(i+batch_size, total)}/{total} chunks")
    return embedded_docs


def process_pdfs():
    """Run the full pipeline: load PDFs, merge, split, embed, and persist.

    Steps:
      1. Wipe/recreate ``data/processed_data``.
      2. Load every PDF page from ``notebook_version/data/``.
      3. Merge pages per source file while tracking page ranges.
      4. Split into ~300-token chunks and tag each with its page number.
      5. Embed all chunks with OpenAI ``text-embedding-3-small``.
      6. Pickle both the raw chunks and the embedded records to disk.

    Side effects: filesystem writes under ``data/processed_data`` and
    OpenAI API calls (requires OPENAI_API_KEY).
    """
    print("Processing PDFs...")

    # Start from a clean output directory every run.
    processed_data_dir = Path("data/processed_data")
    clean_directory(processed_data_dir)

    # Load all PDF documents (each page as a separate document).
    # NOTE(review): input is read from notebook_version/data/ while output
    # goes to data/processed_data/ — confirm this asymmetry is intentional.
    pdf_path = "notebook_version/data/"
    print(f"Loading PDFs from: {pdf_path}")

    loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader)
    all_docs = loader.load()
    print(f"Loaded {len(all_docs)} document pages.")

    merged_docs = _merge_pages_by_source(all_docs)
    print(f"Created {len(merged_docs)} merged documents.")

    # Split merged documents into token-bounded chunks; add_start_index lets
    # us map each chunk back to its source page afterwards.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
        length_function=tiktoken_len,
        add_start_index=True
    )
    raw_splits = text_splitter.split_documents(merged_docs)
    split_chunks = add_page_info_to_splits(raw_splits)
    print(f"Created {len(split_chunks)} chunks.")

    # Save chunks for later use.
    with open(processed_data_dir / "chunks.pkl", "wb") as f:
        pickle.dump(split_chunks, f)

    embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
    print("Embedding document chunks (this may take a while)...")
    embedded_docs = _embed_in_batches(split_chunks, embedding_model)

    # Save the embedded docs for later use.
    with open(processed_data_dir / "embedded_docs.pkl", "wb") as f:
        pickle.dump(embedded_docs, f)

    print("Processing complete. All data saved to data/processed_data/")

# Run the full PDF -> chunks -> embeddings pipeline when executed as a script.
if __name__ == "__main__":
    process_pdfs()