File size: 2,972 Bytes
9806c71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
import unicodedata
import hashlib
from pathlib import Path
import ftfy
import unidecode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

import logging

# Configure the root logger so records go to stderr and show up in Docker logs.
# NOTE(review): basicConfig is a no-op if the root logger already has handlers —
# presumably this module is the container entry point; confirm it is not also
# imported by another configured application.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after this module, per stdlib convention.
logger = logging.getLogger(__name__)

def clean_text(text: str) -> str:
    """Scrub structural noise and encoding damage out of extracted PDF text.

    Removes page markers and decorative rules, repairs mojibake, folds the
    text to ASCII, and normalizes whitespace. Returns the cleaned, stripped
    string.
    """
    # --- 1. Structural scrubbing: drop page markers and horizontal rules ---
    # Applied in this exact order; each (pattern, flags) pair deletes matches.
    structural_noise = (
        (r'Page\s+\d+\s+of\s+\d+', re.IGNORECASE),   # "Page 3 of 12" footers
        (r'\b\d+\s*/\s*\d+\b', 0),                   # "3 / 12" style markers
        (r'^\s*-\s*\d+\s*-\s*$', re.MULTILINE),      # "- 7 -" centered numbers
        (r'[-*_]{3,}', 0),                           # decorative rules
    )
    for pattern, flags in structural_noise:
        text = re.sub(pattern, '', text, flags=flags)

    # --- 2. Encoding repairs: fix mojibake, fold to ASCII, canonicalize ---
    text = ftfy.fix_text(text)
    text = unidecode.unidecode(text)
    text = unicodedata.normalize('NFKC', text)

    # --- 3. Whitespace normalization ---
    text = re.sub(r'[\t\xa0]', ' ', text)            # tabs / NBSP -> space
    # Rejoin lines broken mid-sentence: a lowercase letter on each side of
    # the newline signals an accidental hard wrap, not a paragraph break.
    text = re.sub(r'(?<=[a-z])\n(?=[a-z])', ' ', text)
    text = re.sub(r' +', ' ', text)                  # collapse space runs
    return text.strip()

def build_index(data_dir: str, persist_dir: str):
    """Process PDFs under *data_dir* into a persisted Chroma vector store.

    Recursively loads ``*.pdf`` files, cleans each page's text, splits it
    into overlapping chunks tagged with citation metadata, embeds the chunks
    with Gemini, and persists the resulting vector store.

    Args:
        data_dir: Directory scanned recursively for ``**/*.pdf`` files.
        persist_dir: Directory where the Chroma database is persisted.

    Returns:
        The populated ``Chroma`` vector store.

    Raises:
        ValueError: If no PDF documents are found under ``data_dir``.
    """
    logger.info(f"Starting ingestion from: {data_dir}")

    loader = DirectoryLoader(data_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
    raw_docs = loader.load()
    logger.info(f"Loaded {len(raw_docs)} documents.")
    if not raw_docs:
        # Fail fast with a clear message instead of an opaque downstream
        # error from Chroma.from_documents on an empty list.
        raise ValueError(f"No PDF documents found under: {data_dir}")

    # Gemini 2025 standard embedding model
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, chunk_overlap=150, add_start_index=True
    )

    final_chunks = []
    for i, doc in enumerate(raw_docs):
        logger.info(f"Processing doc {i+1}/{len(raw_docs)}: {doc.metadata.get('source', 'unknown')}")
        cleaned_content = clean_text(doc.page_content)
        source_name = Path(doc.metadata.get("source", "unknown")).name

        # Citation metadata shared by every chunk split from this document.
        # NOTE(review): default page 1 assumes 1-based pages — PyPDFLoader
        # emits 0-based page numbers; confirm intended convention.
        metadata = {
            "source": source_name,
            "page": doc.metadata.get("page", 1),
        }

        chunks = splitter.create_documents([cleaned_content], metadatas=[metadata])
        # BUGFIX: hash each chunk's own text. Previously the MD5 of the whole
        # cleaned document was attached before splitting, so every chunk from
        # the same document carried an identical "chunk_hash", defeating any
        # per-chunk deduplication or identity check. (MD5 here is a content
        # fingerprint, not a security primitive.)
        for chunk in chunks:
            chunk.metadata["chunk_hash"] = hashlib.md5(
                chunk.page_content.encode()
            ).hexdigest()
        final_chunks.extend(chunks)

    logger.info(f"Total chunks created: {len(final_chunks)}")
    logger.info(f"Persisting to VectorDB at: {persist_dir}")

    vectorstore = Chroma.from_documents(
        documents=final_chunks,
        embedding=embeddings,
        persist_directory=persist_dir
    )
    logger.info("VectorDB successfully built and persisted.")
    return vectorstore