import hashlib
import logging
import re
import unicodedata
from pathlib import Path

import ftfy
import unidecode
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure logging to show up in Docker logs
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def clean_text(text: str) -> str:
    """Enterprise-grade scrubbing for structural and encoding noise."""
    # 1. Structural scrubbing
    text = re.sub(r'Page\s+\d+\s+of\s+\d+', '', text, flags=re.IGNORECASE)  # "Page 3 of 12" footers
    text = re.sub(r'\b\d+\s*/\s*\d+\b', '', text)                           # "3 / 12" counters (caveat: also matches fractions like 1/2)
    text = re.sub(r'^\s*-\s*\d+\s*-\s*$', '', text, flags=re.MULTILINE)     # "- 4 -" centered page numbers
    text = re.sub(r'[-*_]{3,}', '', text)                                   # horizontal rules

    # 2. Encoding repairs
    text = ftfy.fix_text(text)                  # repair mojibake
    text = unicodedata.normalize('NFKC', text)  # normalize compatibility forms while text is still Unicode
    text = unidecode.unidecode(text)            # then transliterate remaining non-ASCII to closest ASCII
    # (Normalizing before unidecode matters: NFKC on already-ASCII output would be a no-op.)

    # 3. Whitespace normalization
    text = re.sub(r'[\t\xa0]', ' ', text)               # tabs and non-breaking spaces
    text = re.sub(r'(?<=[a-z])\n(?=[a-z])', ' ', text)  # rejoin mid-sentence line breaks
    text = re.sub(r' +', ' ', text)                     # collapse repeated spaces
    return text.strip()


def build_index(data_dir: str, persist_dir: str):
    """Load PDFs from data_dir, clean and chunk them, and persist a Chroma vector store."""
    logger.info(f"Starting ingestion from: {data_dir}")
    loader = DirectoryLoader(data_dir, glob="**/*.pdf", loader_cls=PyPDFLoader)
    raw_docs = loader.load()
    logger.info(f"Loaded {len(raw_docs)} documents.")

    # Google Generative AI embedding model (requires GOOGLE_API_KEY in the environment)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=150,
        add_start_index=True,
    )

    final_chunks = []
    for i, doc in enumerate(raw_docs):
        logger.info(f"Processing doc {i + 1}/{len(raw_docs)}: {doc.metadata.get('source', 'unknown')}")
        cleaned_content = clean_text(doc.page_content)
        source_name = Path(doc.metadata.get("source", "unknown")).name

        # Per-page metadata carried onto every chunk for citations.
        # PyPDFLoader pages are 0-indexed; md5 here is a content fingerprint
        # for dedup/traceability, not a security measure.
        metadata = {
            "source": source_name,
            "page": doc.metadata.get("page", 0),
            "chunk_hash": hashlib.md5(cleaned_content.encode()).hexdigest(),
        }

        chunks = splitter.create_documents([cleaned_content], metadatas=[metadata])
        final_chunks.extend(chunks)

    logger.info(f"Total chunks created: {len(final_chunks)}")
    logger.info(f"Persisting to VectorDB at: {persist_dir}")

    vectorstore = Chroma.from_documents(
        documents=final_chunks,
        embedding=embeddings,
        persist_directory=persist_dir,
    )
    logger.info("VectorDB successfully built and persisted.")
    return vectorstore
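

# --- Example usage: a minimal sketch of driving the pipeline end to end.
# The paths, query string, and k value below are assumptions for illustration,
# not part of the original pipeline.
if __name__ == "__main__":
    # Build (or rebuild) the index from a local folder of PDFs.
    store = build_index(data_dir="./data", persist_dir="./chroma_db")

    # Quick smoke test: run a similarity search against the fresh store and
    # print each hit's citation metadata alongside a snippet of its content.
    hits = store.similarity_search("What does the document say about refunds?", k=3)
    for hit in hits:
        print(f"{hit.metadata['source']} (page {hit.metadata['page']}): {hit.page_content[:120]}...")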